diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java b/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java index d0b9b5c..55850db 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java @@ -17,14 +17,7 @@ package org.apache.lucene.search; * limitations under the License. */ -import java.io.IOException; -import java.util.*; - -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.*; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.ConjunctionTermScorer.DocsAndFreqs; import org.apache.lucene.search.TermQuery.TermWeight; @@ -33,6 +26,12 @@ import org.apache.lucene.search.similarities.Similarity.ExactSimScorer; import org.apache.lucene.util.Bits; import org.apache.lucene.util.ToStringUtils; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + /** A Query that matches documents matching boolean combinations of other * queries, e.g. {@link TermQuery}s, {@link PhraseQuery}s or other * BooleanQuerys. 
@@ -244,7 +243,7 @@ public class BooleanQuery extends Query implements Iterable { for (Iterator wIter = weights.iterator(); wIter.hasNext();) { Weight w = wIter.next(); BooleanClause c = cIter.next(); - if (w.scorer(context, true, true, context.reader().getLiveDocs()) == null) { + if (w.scorer(context, true, true, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()) == null) { if (c.isRequired()) { fail = true; Explanation r = new Explanation(0.0f, "no match on required clause (" + c.getQuery().toString() + ")"); @@ -307,11 +306,11 @@ public class BooleanQuery extends Query implements Iterable { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { if (termConjunction) { // specialized scorer for term conjunctions - return createConjunctionTermScorer(context, acceptDocs); + return createConjunctionTermScorer(context, acceptDocs, flags); } List required = new ArrayList(); List prohibited = new ArrayList(); @@ -319,7 +318,7 @@ public class BooleanQuery extends Query implements Iterable { Iterator cIter = clauses.iterator(); for (Weight w : weights) { BooleanClause c = cIter.next(); - Scorer subScorer = w.scorer(context, true, false, acceptDocs); + Scorer subScorer = w.scorer(context, true, false, flags, acceptDocs); if (subScorer == null) { if (c.isRequired()) { return null; @@ -341,7 +340,7 @@ public class BooleanQuery extends Query implements Iterable { // return BooleanScorer for topScorer): // Check if we can return a BooleanScorer - if (!scoreDocsInOrder && topScorer && required.size() == 0) { + if (!scoreDocsInOrder && flags == PostingFeatures.DOCS_AND_FREQS && topScorer && required.size() == 0) { return new BooleanScorer(this, disableCoord, minNrShouldMatch, optional, prohibited, maxCoord); } @@ -359,7 +358,7 @@ public class BooleanQuery extends Query implements Iterable { return new 
BooleanScorer2(this, disableCoord, minNrShouldMatch, required, prohibited, optional, maxCoord); } - private Scorer createConjunctionTermScorer(AtomicReaderContext context, Bits acceptDocs) + private Scorer createConjunctionTermScorer(AtomicReaderContext context, Bits acceptDocs, PostingFeatures flags) throws IOException { // TODO: fix scorer API to specify "needsScores" up @@ -369,7 +368,7 @@ public class BooleanQuery extends Query implements Iterable { final DocsAndFreqs[] docsAndFreqs = new DocsAndFreqs[weights.size()]; for (int i = 0; i < docsAndFreqs.length; i++) { final TermWeight weight = (TermWeight) weights.get(i); - final Scorer scorer = weight.scorer(context, true, false, acceptDocs); + final Scorer scorer = weight.scorer(context, true, false, flags, acceptDocs); if (scorer == null) { return null; } else { diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java index fcba747..c933109 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer.java @@ -17,14 +17,14 @@ package org.apache.lucene.search; * limitations under the License. 
*/ +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.search.BooleanQuery.BooleanWeight; +import org.apache.lucene.search.intervals.IntervalIterator; + import java.io.IOException; -import java.util.ArrayList; import java.util.Collection; import java.util.List; -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.search.BooleanQuery.BooleanWeight; - /* Description from Doug Cutting (excerpted from * LUCENE-1483): * @@ -134,6 +134,11 @@ final class BooleanScorer extends Scorer { @Override public float score() { return (float)score; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException("Positions are not supported on out of order collections"); + } } @@ -322,6 +327,11 @@ final class BooleanScorer extends Scorer { } @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException("intervals are not available if docs are matched out of order"); + } + + @Override public float freq() throws IOException { throw new UnsupportedOperationException(); } diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer2.java b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer2.java index c946a89..b6db0ea 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanScorer2.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanScorer2.java @@ -24,6 +24,7 @@ import java.util.List; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery.BooleanWeight; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.search.similarities.Similarity; /* See the description in BooleanScorer.java, comparing @@ -147,6 +148,11 @@ class BooleanScorer2 extends Scorer { public int advance(int target) throws IOException { return scorer.advance(target); } + + @Override + public IntervalIterator 
intervals(boolean collectIntervals) throws IOException { + return scorer.intervals(collectIntervals); + } } private Scorer countingDisjunctionSumScorer(final List scorers, @@ -243,7 +249,7 @@ class BooleanScorer2 extends Scorer { if (minNrShouldMatch > 0) { // use a required disjunction scorer over the optional scorers return addProhibitedScorers( dualConjunctionSumScorer( // non counting - disableCoord, + disableCoord, requiredCountingSumScorer, countingDisjunctionSumScorer( optionalScorers, @@ -278,7 +284,7 @@ class BooleanScorer2 extends Scorer { */ @Override public void score(Collector collector) throws IOException { - collector.setScorer(this); + collector.setScorer(this); while ((doc = countingSumScorer.nextDoc()) != NO_MORE_DOCS) { collector.collect(doc); } @@ -321,6 +327,11 @@ class BooleanScorer2 extends Scorer { public int advance(int target) throws IOException { return doc = countingSumScorer.advance(target); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return countingSumScorer.intervals(collectIntervals); + } @Override public Collection getChildren() { diff --git a/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java b/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java index 19996a3..7bb21c6 100644 --- a/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/CachingCollector.java @@ -18,6 +18,7 @@ package org.apache.lucene.search; */ import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.RamUsageEstimator; import java.io.IOException; @@ -89,6 +90,9 @@ public abstract class CachingCollector extends Collector { @Override public final int nextDoc() { throw new UnsupportedOperationException(); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { throw new 
UnsupportedOperationException(); } } // A CachingCollector which caches scores diff --git a/lucene/core/src/java/org/apache/lucene/search/Collector.java b/lucene/core/src/java/org/apache/lucene/search/Collector.java index 1d4121c..4446905 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Collector.java +++ b/lucene/core/src/java/org/apache/lucene/search/Collector.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReaderContext; +import org.apache.lucene.search.Weight.PostingFeatures; /** *

Expert: Collectors are primarily meant to be used to @@ -173,4 +174,12 @@ public abstract class Collector { */ public abstract boolean acceptsDocsOutOfOrder(); + /** + * Returns the posting features required by this collector. Default value is + * {@link PostingFeatures#DOCS_AND_FREQS}. + */ + public PostingFeatures postingFeatures() { + return PostingFeatures.DOCS_AND_FREQS; + } + } diff --git a/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java b/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java index 2f37693..ec31386 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ConjunctionScorer.java @@ -17,24 +17,30 @@ package org.apache.lucene.search; * limitations under the License. */ -import org.apache.lucene.util.ArrayUtil; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; +import org.apache.lucene.search.intervals.ConjunctionIntervalIterator; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.util.ArrayUtil; + /** Scorer for conjunctions, sets of queries, all of which are required. */ class ConjunctionScorer extends Scorer { + private final Scorer[] scorersOrdered; private final Scorer[] scorers; private int lastDoc = -1; public ConjunctionScorer(Weight weight, Collection scorers) throws IOException { this(weight, scorers.toArray(new Scorer[scorers.size()])); } - + public ConjunctionScorer(Weight weight, Scorer... 
scorers) throws IOException { super(weight); + scorersOrdered = new Scorer[scorers.length]; + System.arraycopy(scorers, 0, scorersOrdered, 0, scorers.length); this.scorers = scorers; for (int i = 0; i < scorers.length; i++) { @@ -136,6 +142,16 @@ class ConjunctionScorer extends Scorer { } return sum; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + if (scorersOrdered == null) { + throw new IllegalStateException("no positions requested for this scorer"); + } + // only created if needed for this scorer - no penalty for non-positional queries + return new ConjunctionIntervalIterator(this, collectIntervals, pullIterators(collectIntervals, scorersOrdered)); + } + @Override public float freq() throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/search/ConjunctionTermScorer.java b/lucene/core/src/java/org/apache/lucene/search/ConjunctionTermScorer.java index bf547bd..156af93 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ConjunctionTermScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ConjunctionTermScorer.java @@ -22,21 +22,28 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; +import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.search.intervals.ConjunctionIntervalIterator; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.search.intervals.TermIntervalIterator; import org.apache.lucene.util.ArrayUtil; + /** Scorer for conjunctions, sets of terms, all of which are required. 
*/ class ConjunctionTermScorer extends Scorer { protected final float coord; protected int lastDoc = -1; protected final DocsAndFreqs[] docsAndFreqs; private final DocsAndFreqs lead; + private DocsAndFreqs[] origDocsAndFreqs; - ConjunctionTermScorer(Weight weight, float coord, - DocsAndFreqs[] docsAndFreqs) { + ConjunctionTermScorer(Weight weight, float coord, DocsAndFreqs[] docsAndFreqs) { super(weight); this.coord = coord; this.docsAndFreqs = docsAndFreqs; + this.origDocsAndFreqs = new DocsAndFreqs[docsAndFreqs.length]; + System.arraycopy(docsAndFreqs, 0,origDocsAndFreqs, 0, docsAndFreqs.length); // Sort the array the first time to allow the least frequent DocsEnum to // lead the matching. ArrayUtil.mergeSort(docsAndFreqs, new Comparator() { @@ -128,4 +135,17 @@ class ConjunctionTermScorer extends Scorer { this.scorer = scorer; } } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + + TermIntervalIterator[] positionIters = new TermIntervalIterator[origDocsAndFreqs.length]; + for (int i = 0; i < positionIters.length; i++) { + DocsAndFreqs d = origDocsAndFreqs[i]; + assert d.docs instanceof DocsAndPositionsEnum; + positionIters[i] = new TermIntervalIterator(this, (DocsAndPositionsEnum)d.docs, false, collectIntervals); + } + return new ConjunctionIntervalIterator(this, collectIntervals, positionIters); + } + } diff --git a/lucene/core/src/java/org/apache/lucene/search/ConstantScoreQuery.java b/lucene/core/src/java/org/apache/lucene/search/ConstantScoreQuery.java index 9da1ddd..2c8be72 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ConstantScoreQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/ConstantScoreQuery.java @@ -20,6 +20,8 @@ package org.apache.lucene.search; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.search.Weight.PostingFeatures; +import 
org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.ToStringUtils; @@ -122,7 +124,7 @@ public class ConstantScoreQuery extends Query { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, final Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, final Bits acceptDocs) throws IOException { final DocIdSetIterator disi; if (filter != null) { assert query == null; @@ -133,7 +135,7 @@ public class ConstantScoreQuery extends Query { disi = dis.iterator(); } else { assert query != null && innerWeight != null; - disi = innerWeight.scorer(context, scoreDocsInOrder, topScorer, acceptDocs); + disi = innerWeight.scorer(context, scoreDocsInOrder, topScorer, flags, acceptDocs); } if (disi == null) { @@ -149,7 +151,7 @@ public class ConstantScoreQuery extends Query { @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - final Scorer cs = scorer(context, true, false, context.reader().getLiveDocs()); + final Scorer cs = scorer(context, true, false, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()); final boolean exists = (cs != null && cs.advance(doc) == doc); final ComplexExplanation result = new ComplexExplanation(); @@ -204,6 +206,7 @@ public class ConstantScoreQuery extends Query { return docIdSetIterator.advance(target); } + private Collector wrapCollector(final Collector collector) { return new Collector() { @Override @@ -223,6 +226,11 @@ public class ConstantScoreQuery extends Query { } @Override + public PostingFeatures postingFeatures() { + return collector.postingFeatures(); + } + + @Override public boolean acceptsDocsOutOfOrder() { return collector.acceptsDocsOutOfOrder(); } @@ -248,6 +256,15 @@ public class ConstantScoreQuery extends Query { return super.score(collector, max, firstDocID); } } + + @Override + public IntervalIterator intervals(boolean 
collectIntervals) throws IOException { + if (docIdSetIterator instanceof Scorer) { + return ((Scorer) docIdSetIterator).intervals(collectIntervals); + } else { + throw new UnsupportedOperationException("positions are only supported on Scorer subclasses"); + } + } } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java index ed1e26b..cd3d693 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java @@ -152,12 +152,12 @@ public class DisjunctionMaxQuery extends Query implements Iterable { /** Create the scorer used to score our associated DisjunctionMaxQuery */ @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { Scorer[] scorers = new Scorer[weights.size()]; int idx = 0; for (Weight w : weights) { // we will advance() subscorers - Scorer subScorer = w.scorer(context, true, false, acceptDocs); + Scorer subScorer = w.scorer(context, true, false, flags, acceptDocs); if (subScorer != null) { scorers[idx++] = subScorer; } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java index c5c7327..1c7a64d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java @@ -16,6 +16,9 @@ package org.apache.lucene.search; * limitations under the License. 
*/ +import org.apache.lucene.search.intervals.DisjunctionIntervalIterator; +import org.apache.lucene.search.intervals.IntervalIterator; + import java.io.IOException; /** @@ -51,6 +54,7 @@ class DisjunctionMaxScorer extends DisjunctionScorer { Scorer[] subScorers, int numScorers) { super(weight, subScorers, numScorers); this.tieBreakerMultiplier = tieBreakerMultiplier; + } @Override @@ -132,4 +136,9 @@ class DisjunctionMaxScorer extends DisjunctionScorer { } return doc = subScorers[0].docID(); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return new DisjunctionIntervalIterator(this, collectIntervals, pullIterators(collectIntervals, subScorers)); + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java index c3d32b1..6337c97 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionSumScorer.java @@ -20,6 +20,10 @@ package org.apache.lucene.search; import java.util.List; import java.io.IOException; +import org.apache.lucene.search.intervals.ConjunctionIntervalIterator; +import org.apache.lucene.search.intervals.DisjunctionIntervalIterator; +import org.apache.lucene.search.intervals.IntervalIterator; + /** A Scorer for OR like queries, counterpart of ConjunctionScorer. * This Scorer implements {@link Scorer#advance(int)} and uses advance() on the given Scorers. */ @@ -34,7 +38,7 @@ class DisjunctionSumScorer extends DisjunctionScorer { protected int nrMatchers = -1; private double score = Float.NaN; - + /** Construct a DisjunctionScorer. * @param weight The weight to be used. * @param subScorers A collection of at least two subscorers. 
@@ -48,7 +52,6 @@ class DisjunctionSumScorer extends DisjunctionScorer { */ public DisjunctionSumScorer(Weight weight, List subScorers, int minimumNrMatchers) throws IOException { super(weight, subScorers.toArray(new Scorer[subScorers.size()]), subScorers.size()); - if (minimumNrMatchers <= 0) { throw new IllegalArgumentException("Minimum nr of matchers must be positive"); } @@ -67,6 +70,7 @@ class DisjunctionSumScorer extends DisjunctionScorer { } @Override + public int nextDoc() throws IOException { assert doc != NO_MORE_DOCS; while(true) { @@ -166,4 +170,13 @@ class DisjunctionSumScorer extends DisjunctionScorer { return nextDoc(); } } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + if (minimumNrMatchers > 1) { + return new ConjunctionIntervalIterator(this, + collectIntervals, minimumNrMatchers, pullIterators(collectIntervals, subScorers)); + } + return new DisjunctionIntervalIterator(this, collectIntervals, pullIterators(collectIntervals, subScorers)); + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java index da4afd5..f5f59a1 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java @@ -17,24 +17,29 @@ package org.apache.lucene.search; * limitations under the License. 
*/ +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.search.PhraseQuery.TermDocsEnumFactory; +import org.apache.lucene.search.intervals.BlockIntervalIterator; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.search.intervals.TermIntervalIterator; +import org.apache.lucene.search.similarities.Similarity; + import java.io.IOException; import java.util.Arrays; -import org.apache.lucene.index.*; -import org.apache.lucene.search.similarities.Similarity; - final class ExactPhraseScorer extends Scorer { private final int endMinus1; - + private final static int CHUNK = 4096; - + private int gen; private final int[] counts = new int[CHUNK]; private final int[] gens = new int[CHUNK]; - + boolean noDocs; - + private final static class ChunkState { + final TermDocsEnumFactory factory; final DocsAndPositionsEnum posEnum; final int offset; final boolean useAdvance; @@ -42,61 +47,65 @@ final class ExactPhraseScorer extends Scorer { int posLimit; int pos; int lastPos; - - public ChunkState(DocsAndPositionsEnum posEnum, int offset, boolean useAdvance) { + + public ChunkState(TermDocsEnumFactory factory, DocsAndPositionsEnum posEnum, int offset, + boolean useAdvance) throws IOException { + this.factory = factory; this.posEnum = posEnum; this.offset = offset; this.useAdvance = useAdvance; } } - + private final ChunkState[] chunkStates; - + private int docID = -1; private int freq; - - private final Similarity.ExactSimScorer docScorer; + private final Similarity.ExactSimScorer docScorer; + ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity.ExactSimScorer docScorer) throws IOException { + Similarity.ExactSimScorer docScorer) throws IOException { super(weight); this.docScorer = docScorer; - + chunkStates = new ChunkState[postings.length]; - - endMinus1 = postings.length-1; - - for(int i=0;i 1/5th) rarer than // the first term, then we just use .nextDoc() when - // ANDing. 
This buys ~15% gain for phrases where + // ANDing. This buys ~15% gain for phrases where // freq of rarest 2 terms is close: - final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq; - chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance); - if (i > 0 && postings[i].postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { + final boolean useAdvance = postings[i].docFreq > 5 * postings[0].docFreq; + chunkStates[i] = new ChunkState(postings[i].factory, postings[i].postings, + -postings[i].position, useAdvance); + if (i > 0 + && postings[i].postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { noDocs = true; return; } } } - + @Override public int nextDoc() throws IOException { - while(true) { - + while (true) { + // first (rarest) term final int doc = chunkStates[0].posEnum.nextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { docID = doc; return doc; } - + // not-first terms int i = 1; - while(i < chunkStates.length) { + while (i < chunkStates.length) { final ChunkState cs = chunkStates[i]; int doc2 = cs.posEnum.docID(); if (cs.useAdvance) { @@ -105,7 +114,7 @@ final class ExactPhraseScorer extends Scorer { } } else { int iter = 0; - while(doc2 < doc) { + while (doc2 < doc) { // safety net -- fallback to .advance if we've // done too many .nextDocs if (++iter == 50) { @@ -121,12 +130,12 @@ final class ExactPhraseScorer extends Scorer { } i++; } - + if (i == chunkStates.length) { // this doc has all the terms -- now test whether // phrase occurs docID = doc; - + freq = phraseFreq(); if (freq != 0) { return docID; @@ -134,22 +143,22 @@ final class ExactPhraseScorer extends Scorer { } } } - + @Override public int advance(int target) throws IOException { - + // first term int doc = chunkStates[0].posEnum.advance(target); if (doc == DocIdSetIterator.NO_MORE_DOCS) { docID = DocIdSetIterator.NO_MORE_DOCS; return doc; } - - while(true) { + + while (true) { // not-first terms int i = 1; - while(i < chunkStates.length) { + 
while (i < chunkStates.length) { int doc2 = chunkStates[i].posEnum.docID(); if (doc2 < doc) { doc2 = chunkStates[i].posEnum.advance(doc); @@ -159,7 +168,7 @@ final class ExactPhraseScorer extends Scorer { } i++; } - + if (i == chunkStates.length) { // this doc has all the terms -- now test whether // phrase occurs @@ -169,7 +178,7 @@ final class ExactPhraseScorer extends Scorer { return docID; } } - + doc = chunkStates[0].posEnum.nextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { docID = doc; @@ -177,63 +186,63 @@ final class ExactPhraseScorer extends Scorer { } } } - + @Override public String toString() { return "ExactPhraseScorer(" + weight + ")"; } - + @Override public float freq() { return freq; } - + @Override public int docID() { return docID; } - + @Override public float score() { return docScorer.score(docID, freq); } - + private int phraseFreq() throws IOException { - + freq = 0; - + // init chunks - for(int i=0;i cs.lastPos) { cs.lastPos = cs.pos; final int posIndex = cs.pos - chunkStart; @@ -241,7 +250,7 @@ final class ExactPhraseScorer extends Scorer { assert gens[posIndex] != gen; gens[posIndex] = gen; } - + if (cs.posUpto == cs.posLimit) { end = true; break; @@ -250,13 +259,13 @@ final class ExactPhraseScorer extends Scorer { cs.pos = cs.offset + cs.posEnum.nextPosition(); } } - + // middle terms boolean any = true; - for(int t=1;t cs.lastPos) { cs.lastPos = cs.pos; final int posIndex = cs.pos - chunkStart; @@ -266,7 +275,7 @@ final class ExactPhraseScorer extends Scorer { any = true; } } - + if (cs.posUpto == cs.posLimit) { end = true; break; @@ -274,32 +283,33 @@ final class ExactPhraseScorer extends Scorer { cs.posUpto++; cs.pos = cs.offset + cs.posEnum.nextPosition(); } - + if (!any) { break; } } - + if (!any) { // petered out for this chunk chunkStart += CHUNK; chunkEnd += CHUNK; continue; } - + // last term - + { final ChunkState cs = chunkStates[endMinus1]; - while(cs.pos < chunkEnd) { + while (cs.pos < chunkEnd) { if (cs.pos > cs.lastPos) 
{ cs.lastPos = cs.pos; final int posIndex = cs.pos - chunkStart; - if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) { + if (posIndex >= 0 && gens[posIndex] == gen + && counts[posIndex] == endMinus1) { freq++; } } - + if (cs.posUpto == cs.posLimit) { end = true; break; @@ -308,11 +318,21 @@ final class ExactPhraseScorer extends Scorer { cs.pos = cs.offset + cs.posEnum.nextPosition(); } } - + chunkStart += CHUNK; chunkEnd += CHUNK; } - + return freq; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + TermIntervalIterator[] posIters = new TermIntervalIterator[chunkStates.length]; + DocsAndPositionsEnum[] enums = new DocsAndPositionsEnum[chunkStates.length]; + for (int i = 0; i < chunkStates.length; i++) { + posIters[i] = new TermIntervalIterator(this, enums[i] = chunkStates[i].factory.docsAndPositionsEnum(), false, collectIntervals); + } + return new PhraseScorer.AdvancingIntervalIterator(this, collectIntervals, enums, new BlockIntervalIterator(this, collectIntervals, posIters)); + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/FilteredQuery.java b/lucene/core/src/java/org/apache/lucene/search/FilteredQuery.java index eceb0ff..8cec7e8 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FilteredQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/FilteredQuery.java @@ -20,6 +20,8 @@ package org.apache.lucene.search; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.ToStringUtils; @@ -122,7 +124,7 @@ public class FilteredQuery extends Query { // return a filtering scorer @Override - public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, final Bits acceptDocs) 
throws IOException { + public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { assert filter != null; final DocIdSet filterDocIdSet = filter.getDocIdSet(context, acceptDocs); @@ -130,7 +132,7 @@ public class FilteredQuery extends Query { // this means the filter does not accept any documents. return null; } - return strategy.filteredScorer(context, scoreDocsInOrder, topScorer, weight, filterDocIdSet); + return strategy.filteredScorer(context, scoreDocsInOrder, topScorer, weight, filterDocIdSet, flags); } }; @@ -192,7 +194,6 @@ public class FilteredQuery extends Query { } } - @Override public int docID() { return scorerDoc; @@ -210,6 +211,12 @@ public class FilteredQuery extends Query { public Collection getChildren() { return Collections.singleton(new ChildScorer(scorer, "FILTERED")); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) + throws IOException { + return scorer.intervals(collectIntervals); + } } /** @@ -304,6 +311,12 @@ public class FilteredQuery extends Query { public final Collection getChildren() { return Collections.singleton(new ChildScorer(scorer, "FILTERED")); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) + throws IOException { + return scorer.intervals(collectIntervals); + } } // TODO once we have way to figure out if we use RA or LeapFrog we can remove this scorer @@ -478,13 +491,14 @@ public class FilteredQuery extends Query { * be called. * @param weight the {@link FilteredQuery} {@link Weight} to create the filtered scorer. * @param docIdSet the filter {@link DocIdSet} to apply + * @param flags the low level {@link PostingFeatures} for this scorer. 
* @return a filtered scorer * * @throws IOException if an {@link IOException} occurs */ public abstract Scorer filteredScorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, Weight weight, - DocIdSet docIdSet) throws IOException; + DocIdSet docIdSet, PostingFeatures flags) throws IOException; } /** @@ -498,7 +512,7 @@ public class FilteredQuery extends Query { public static class RandomAccessFilterStrategy extends FilterStrategy { @Override - public Scorer filteredScorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, Weight weight, DocIdSet docIdSet) throws IOException { + public Scorer filteredScorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, Weight weight, DocIdSet docIdSet, PostingFeatures flags) throws IOException { final DocIdSetIterator filterIter = docIdSet.iterator(); if (filterIter == null) { // this means the filter does not accept any documents. @@ -515,12 +529,12 @@ public class FilteredQuery extends Query { final boolean useRandomAccess = (filterAcceptDocs != null && (useRandomAccess(filterAcceptDocs, firstFilterDoc))); if (useRandomAccess) { // if we are using random access, we return the inner scorer, just with other acceptDocs - return weight.scorer(context, scoreDocsInOrder, topScorer, filterAcceptDocs); + return weight.scorer(context, scoreDocsInOrder, topScorer, flags, filterAcceptDocs); } else { assert firstFilterDoc > -1; // we are gonna advance() this scorer, so we set inorder=true/toplevel=false // we pass null as acceptDocs, as our filter has already respected acceptDocs, no need to do twice - final Scorer scorer = weight.scorer(context, true, false, null); + final Scorer scorer = weight.scorer(context, true, false, flags, null); // TODO once we have way to figure out if we use RA or LeapFrog we can remove this scorer return (scorer == null) ? 
null : new PrimaryAdvancedLeapFrogScorer(weight, firstFilterDoc, filterIter, scorer); } @@ -554,7 +568,7 @@ public class FilteredQuery extends Query { @Override public Scorer filteredScorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, Weight weight, - DocIdSet docIdSet) throws IOException { + DocIdSet docIdSet, PostingFeatures flags) throws IOException { final DocIdSetIterator filterIter = docIdSet.iterator(); if (filterIter == null) { // this means the filter does not accept any documents. @@ -562,7 +576,7 @@ public class FilteredQuery extends Query { } // we are gonna advance() this scorer, so we set inorder=true/toplevel=false // we pass null as acceptDocs, as our filter has already respected acceptDocs, no need to do twice - final Scorer scorer = weight.scorer(context, true, false, null); + final Scorer scorer = weight.scorer(context, true, false, flags, null); if (scorerFirst) { return (scorer == null) ? null : new LeapFrogScorer(weight, scorer, filterIter, scorer); } else { @@ -588,13 +602,13 @@ public class FilteredQuery extends Query { private static final class QueryFirstFilterStrategy extends FilterStrategy { @Override public Scorer filteredScorer(final AtomicReaderContext context, - boolean scoreDocsInOrder, boolean topScorer, Weight weight, - DocIdSet docIdSet) throws IOException { + boolean scoreDocsInOrder, boolean topScorer, Weight weight, + DocIdSet docIdSet, PostingFeatures flags) throws IOException { Bits filterAcceptDocs = docIdSet.bits(); if (filterAcceptDocs == null) { - return LEAP_FROG_QUERY_FIRST_STRATEGY.filteredScorer(context, scoreDocsInOrder, topScorer, weight, docIdSet); + return LEAP_FROG_QUERY_FIRST_STRATEGY.filteredScorer(context, scoreDocsInOrder, topScorer, weight, docIdSet, flags); } - final Scorer scorer = weight.scorer(context, true, false, null); + final Scorer scorer = weight.scorer(context, true, false, flags, null); return scorer == null ? 
null : new QueryFirstScorer(weight, filterAcceptDocs, scorer); } diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java index 5626b25..50dc630 100644 --- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java @@ -44,6 +44,7 @@ import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermContext; import org.apache.lucene.index.Terms; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.NIOFSDirectory; // javadoc @@ -584,7 +585,7 @@ public class IndexSearcher { // always use single thread: for (AtomicReaderContext ctx : leaves) { // search each subreader collector.setNextReader(ctx); - Scorer scorer = weight.scorer(ctx, !collector.acceptsDocsOutOfOrder(), true, ctx.reader().getLiveDocs()); + Scorer scorer = weight.scorer(ctx, !collector.acceptsDocsOutOfOrder(), true, collector.postingFeatures(), ctx.reader().getLiveDocs()); if (scorer != null) { scorer.score(collector); } @@ -771,6 +772,11 @@ public class IndexSearcher { public float score() { return score; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return null; + } } private final FakeScorer fakeScorer = new FakeScorer(); diff --git a/lucene/core/src/java/org/apache/lucene/search/MatchAllDocsQuery.java b/lucene/core/src/java/org/apache/lucene/search/MatchAllDocsQuery.java index 5844c93..79c811c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MatchAllDocsQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MatchAllDocsQuery.java @@ -20,6 +20,7 @@ package org.apache.lucene.search; import org.apache.lucene.index.AtomicReaderContext; import 
org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.Bits; @@ -77,6 +78,11 @@ public class MatchAllDocsQuery extends Query { doc = target-1; return nextDoc(); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException("MatchAllDocsQuery doesn't support IntervalIterators"); + } } private class MatchAllDocsWeight extends Weight { @@ -110,7 +116,7 @@ public class MatchAllDocsQuery extends Query { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { return new MatchAllScorer(context.reader(), acceptDocs, this, queryWeight); } diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java index 86494af..1933cec 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -1,6 +1,6 @@ package org.apache.lucene.search; -/* +/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -17,26 +17,16 @@ package org.apache.lucene.search; * limitations under the License. 
*/ -import java.io.IOException; -import java.util.*; - -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.AtomicReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; -import org.apache.lucene.index.TermState; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.similarities.Similarity.SloppySimScorer; +import org.apache.lucene.index.*; +import org.apache.lucene.search.PhraseQuery.TermDocsEnumFactory; +import org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.search.similarities.Similarity; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; +import org.apache.lucene.search.similarities.Similarity.SloppySimScorer; +import org.apache.lucene.util.*; import org.apache.lucene.util.PriorityQueue; -import org.apache.lucene.util.ToStringUtils; + +import java.io.IOException; +import java.util.*; /** * MultiPhraseQuery is a generalized version of PhraseQuery, with an added @@ -174,7 +164,7 @@ public class MultiPhraseQuery extends Query { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { assert !termArrays.isEmpty(); final AtomicReader reader = context.reader(); final Bits liveDocs = acceptDocs; @@ -194,7 +184,7 @@ public class MultiPhraseQuery extends Query { final DocsAndPositionsEnum postingsEnum; int docFreq; - + TermDocsEnumFactory factory; if (terms.length > 1) { postingsEnum = new UnionDocsAndPositionsEnum(liveDocs, context, terms, termContexts, termsEnum); @@ -216,6 +206,7 @@ public class MultiPhraseQuery extends Query { // 
None of the terms are in this reader return null; } + factory = new MultiTermDocsEnumFactory(liveDocs, context, terms, termContexts, termsEnum, flags); } else { final Term term = terms[0]; TermState termState = termContexts.get(term).get(context.ord); @@ -232,10 +223,10 @@ public class MultiPhraseQuery extends Query { throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.text() + ")"); } - docFreq = termsEnum.docFreq(); + factory = new TermDocsEnumFactory(BytesRef.deepCopyOf(term.bytes()), termsEnum, flags, acceptDocs); } - - postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms); + + postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, factory, termsEnum.docFreq() , positions.get(pos).intValue(), terms); } // sort by increasing docFreq order @@ -257,7 +248,7 @@ public class MultiPhraseQuery extends Query { @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - Scorer scorer = scorer(context, true, false, context.reader().getLiveDocs()); + Scorer scorer = scorer(context, true, false, PostingFeatures.POSITIONS, context.reader().getLiveDocs()); if (scorer != null) { int newDoc = scorer.advance(doc); if (newDoc == doc) { @@ -401,6 +392,27 @@ public class MultiPhraseQuery extends Query { } return true; } + + private static class MultiTermDocsEnumFactory extends TermDocsEnumFactory { + + AtomicReaderContext context; + Term[] terms; + Map termContexts; + + MultiTermDocsEnumFactory(Bits liveDocs, AtomicReaderContext context, Term[] terms, + Map termContexts, TermsEnum termsEnum, PostingFeatures flags) throws IOException { + super(termsEnum, flags, liveDocs); + this.context = context; + this.terms = terms; + this.termContexts = termContexts; + } + + @Override + public DocsAndPositionsEnum docsAndPositionsEnum() throws IOException { + return new 
UnionDocsAndPositionsEnum(liveDocs, context, terms, termContexts, termsEnum, flags); + } + + } } /** @@ -429,25 +441,41 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { } } - private static final class IntQueue { - private int _arraySize = 16; + // TODO: Reimplement this as int[_arraySize * 3], storing position at i * 3, + // startOffset at i * 3 + 1 and endOffset at i * 3 + 2. Will need to also + // implement a new SorterTemplate to sort the array. + + private static final class PositionQueue { + private int _arraySize = 48; private int _index = 0; private int _lastIndex = 0; private int[] _array = new int[_arraySize]; - final void add(int i) { - if (_lastIndex == _arraySize) + final void add(int pos, int start, int end) { + if (_lastIndex * 3 == _arraySize) growArray(); - _array[_lastIndex++] = i; + _array[_lastIndex * 3] = pos; + _array[_lastIndex * 3 + 1] = start; + _array[_lastIndex * 3 + 2] = end; + _lastIndex += 1; } final int next() { - return _array[_index++]; + return _array[_index++ * 3]; + } + + final int startOffset() { + return _array[(_index - 1) * 3 + 1]; + } + + final int endOffset() { + return _array[(_index - 1) * 3 + 2]; } final void sort() { - Arrays.sort(_array, _index, _lastIndex); + //Arrays.sort(_array, _index, _lastIndex); + sorter.quickSort(_index, _lastIndex - 1); } final void clear() { @@ -465,14 +493,52 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { _array = newArray; _arraySize *= 2; } + + private SorterTemplate sorter = new SorterTemplate() { + private int pivot; + + @Override + protected void swap(int i, int j) { + int ti = _array[i * 3]; + int ts = _array[i * 3 + 1]; + int te = _array[i * 3 + 2]; + _array[i * 3] = _array[j * 3]; + _array[i * 3 + 1] = _array[j * 3 + 1]; + _array[i * 3 + 2] = _array[j * 3 + 2]; + _array[j * 3] = ti; + _array[j * 3 + 1] = ts; + _array[j * 3 + 2] = te; + } + + @Override + protected int compare(int i, int j) { + return _array[i * 3] - _array[j * 3]; + } + + @Override 
+ protected void setPivot(int i) { + pivot = i; + } + + @Override + protected int comparePivot(int j) { + return pivot - _array[j * 3]; + } + }; } private int _doc; private int _freq; private DocsQueue _queue; - private IntQueue _posList; + private PositionQueue _posList; + + public UnionDocsAndPositionsEnum(Bits liveDocs, AtomicReaderContext context, Term[] terms, + Map termContexts, TermsEnum termsEnum) throws IOException { + this(liveDocs, context, terms, termContexts, termsEnum, PostingFeatures.POSITIONS); + } - public UnionDocsAndPositionsEnum(Bits liveDocs, AtomicReaderContext context, Term[] terms, Map termContexts, TermsEnum termsEnum) throws IOException { + public UnionDocsAndPositionsEnum(Bits liveDocs, AtomicReaderContext context, Term[] terms, + Map termContexts, TermsEnum termsEnum, PostingFeatures flags) throws IOException { List docsEnums = new LinkedList(); for (int i = 0; i < terms.length; i++) { final Term term = terms[i]; @@ -482,7 +548,7 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { continue; } termsEnum.seekExact(term.bytes(), termState); - DocsAndPositionsEnum postings = termsEnum.docsAndPositions(liveDocs, null, 0); + DocsAndPositionsEnum postings = termsEnum.docsAndPositions(liveDocs, null, flags.docsAndPositionsFlags()); if (postings == null) { // term does exist, but has no positions throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.text() + ")"); @@ -491,7 +557,7 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { } _queue = new DocsQueue(docsEnums); - _posList = new IntQueue(); + _posList = new PositionQueue(); } @Override @@ -513,7 +579,7 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { final int freq = postings.freq(); for (int i = 0; i < freq; i++) { - _posList.add(postings.nextPosition()); + _posList.add(postings.nextPosition(), postings.startOffset(), postings.endOffset()); } if 
(postings.nextDoc() != NO_MORE_DOCS) { @@ -536,12 +602,12 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { @Override public int startOffset() { - return -1; + return _posList.startOffset(); } @Override public int endOffset() { - return -1; + return _posList.endOffset(); } @Override @@ -561,7 +627,7 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { } @Override - public final int freq() { + public final int freq() throws IOException { return _freq; } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java b/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java index c975b01..3f38845 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhrasePositions.java @@ -17,8 +17,10 @@ package org.apache.lucene.search; * limitations under the License. */ +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.Term; + import java.io.IOException; -import org.apache.lucene.index.*; /** * Position of a term in a document that takes into account the term offset within the phrase. 
@@ -44,6 +46,7 @@ final class PhrasePositions { final boolean next() throws IOException { // increments to next doc doc = postings.nextDoc(); + if (doc == DocIdSetIterator.NO_MORE_DOCS) { return false; } @@ -80,10 +83,14 @@ final class PhrasePositions { /** for debug purposes */ @Override public String toString() { - String s = "d:"+doc+" o:"+offset+" p:"+position+" c:"+count; + String s = "d:"+doc+" offset:"+offset+" position:"+position+" c:"+count; if (rptGroup >=0 ) { s += " rpt:"+rptGroup+",i"+rptInd; } + s += " t: [" + terms[0]; + for (int i = 1; i < terms.length; i++) + s += "," + terms[1]; + s += "]"; return s; } } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 6471b10..b162726 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -1,6 +1,6 @@ package org.apache.lucene.search; -/* +/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -17,28 +17,17 @@ package org.apache.lucene.search; * limitations under the License. 
*/ +import org.apache.lucene.index.*; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.search.similarities.Similarity.SloppySimScorer; +import org.apache.lucene.util.*; + import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Set; -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.AtomicReader; -import org.apache.lucene.index.IndexReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; -import org.apache.lucene.index.TermState; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.similarities.Similarity.SloppySimScorer; -import org.apache.lucene.search.similarities.Similarity; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.ToStringUtils; - /** A Query that matches documents containing a particular sequence of terms. * A PhraseQuery is built by QueryParser for input like "new york". * @@ -133,13 +122,15 @@ public class PhraseQuery extends Query { } static class PostingsAndFreq implements Comparable { + final TermDocsEnumFactory factory; final DocsAndPositionsEnum postings; final int docFreq; final int position; final Term[] terms; final int nTerms; // for faster comparisons - public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position, Term... terms) { + public PostingsAndFreq(DocsAndPositionsEnum postings, TermDocsEnumFactory factory, int docFreq, int position, Term... 
terms) throws IOException { + this.factory = factory; this.postings = postings; this.docFreq = docFreq; this.position = position; @@ -240,7 +231,7 @@ public class PhraseQuery extends Query { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { assert !terms.isEmpty(); final AtomicReader reader = context.reader(); final Bits liveDocs = acceptDocs; @@ -262,8 +253,7 @@ public class PhraseQuery extends Query { return null; } te.seekExact(t.bytes(), state); - DocsAndPositionsEnum postingsEnum = te.docsAndPositions(liveDocs, null, 0); - + final DocsAndPositionsEnum postingsEnum = te.docsAndPositions(liveDocs, null, 0); // PhraseQuery on a field that did not index // positions. if (postingsEnum == null) { @@ -271,7 +261,8 @@ public class PhraseQuery extends Query { // term does exist, but has no positions throw new IllegalStateException("field \"" + t.field() + "\" was indexed without position data; cannot run PhraseQuery (term=" + t.text() + ")"); } - postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.docFreq(), positions.get(i).intValue(), t); + TermDocsEnumFactory factory = new TermDocsEnumFactory(BytesRef.deepCopyOf(t.bytes()), te, flags, acceptDocs); + postingsFreqs[i] = new PostingsAndFreq(postingsEnum, factory, te.docFreq(), positions.get(i).intValue(), t); } // sort by increasing docFreq order @@ -299,7 +290,7 @@ public class PhraseQuery extends Query { @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - Scorer scorer = scorer(context, true, false, context.reader().getLiveDocs()); + Scorer scorer = scorer(context, true, false, PostingFeatures.POSITIONS, context.reader().getLiveDocs()); if (scorer != null) { int newDoc = scorer.advance(doc); if (newDoc == doc) { @@ -397,4 +388,32 @@ public class PhraseQuery extends Query { ^ 
positions.hashCode(); } + static class TermDocsEnumFactory { + protected final TermsEnum termsEnum; + protected final Bits liveDocs; + protected final BytesRef term; + protected final PostingFeatures flags; + + TermDocsEnumFactory(TermsEnum termsEnum, PostingFeatures flags, Bits liveDocs) { + this(null, termsEnum, flags, liveDocs); + } + + TermDocsEnumFactory(BytesRef term, TermsEnum termsEnum, PostingFeatures flags, Bits liveDocs) { + this.termsEnum = termsEnum; + this.liveDocs = liveDocs; + this.term = term; + this.flags = flags; + } + + + public DocsAndPositionsEnum docsAndPositionsEnum() + throws IOException { + if (term != null) { + assert term != null; + termsEnum.seekExact(term, false); + } + return termsEnum.docsAndPositions(liveDocs, null, flags.docsAndPositionsFlags()); + } + + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java index 3de6ce8..5fea88e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseScorer.java @@ -17,10 +17,14 @@ package org.apache.lucene.search; * limitations under the License. */ -import java.io.IOException; - +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalCollector; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.search.similarities.Similarity; +import java.io.IOException; + /** Expert: Scoring functionality for phrase queries. *
A document is considered matching if it contains the phrase-query terms * at "valid" positions. What "valid positions" are @@ -37,12 +41,16 @@ abstract class PhraseScorer extends Scorer { private float freq; //phrase frequency in current doc as computed by phraseFreq(). final Similarity.SloppySimScorer docScorer; + protected final PhraseQuery.PostingsAndFreq[] postings; PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity.SloppySimScorer docScorer) { + Similarity.SloppySimScorer docScorer) throws IOException { super(weight); this.docScorer = docScorer; - + this.postings = postings; + reset(false); + } + void reset(boolean needsOffsets) throws IOException { // convert tps to a list of phrase positions. // note: phrase-position differs from term-position in that its position // reflects the phrase offset: pp.pos = tp.pos - offset. @@ -77,7 +85,7 @@ abstract class PhraseScorer extends Scorer { return docScorer.score(max.doc, freq); } - private boolean advanceMin(int target) throws IOException { + protected boolean advanceMin(int target) throws IOException { if (!min.skipTo(target)) { max.doc = NO_MORE_DOCS; // for further calls to docID() return false; @@ -129,5 +137,49 @@ abstract class PhraseScorer extends Scorer { @Override public String toString() { return "scorer(" + weight + ")"; } + + + final static class AdvancingIntervalIterator extends IntervalIterator { + + public AdvancingIntervalIterator(Scorer scorer, boolean collectIntervals, final DocsAndPositionsEnum[] enums, final IntervalIterator delegate) { + super(scorer, collectIntervals); + this.enums = enums; + this.delegate = delegate; + } + + private final DocsAndPositionsEnum[] enums; + private final IntervalIterator delegate; + @Override + public int scorerAdvanced(int docId) throws IOException { + assert docId == docID(); + for (DocsAndPositionsEnum oneEnum : enums) { + int advance = oneEnum.advance(docId); + assert advance == docId; + } + delegate.scorerAdvanced(docId); + 
return docId; + } + + @Override + public Interval next() throws IOException { + return delegate.next(); + } + + @Override + public void collect(IntervalCollector collector) { + delegate.collect(collector); + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return delegate.subs(inOrder); + } + + @Override + public int matchDistance() { + return delegate.matchDistance(); + } + + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/QueryWrapperFilter.java b/lucene/core/src/java/org/apache/lucene/search/QueryWrapperFilter.java index 644ac2d..eb85e6a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/QueryWrapperFilter.java +++ b/lucene/core/src/java/org/apache/lucene/search/QueryWrapperFilter.java @@ -20,6 +20,7 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.util.Bits; /** @@ -57,7 +58,7 @@ public class QueryWrapperFilter extends Filter { return new DocIdSet() { @Override public DocIdSetIterator iterator() throws IOException { - return weight.scorer(privateContext, true, false, acceptDocs); + return weight.scorer(privateContext, true, false, PostingFeatures.DOCS_AND_FREQS, acceptDocs); } @Override public boolean isCacheable() { return false; } diff --git a/lucene/core/src/java/org/apache/lucene/search/ReqExclScorer.java b/lucene/core/src/java/org/apache/lucene/search/ReqExclScorer.java index bf7defe..11f9701 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ReqExclScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ReqExclScorer.java @@ -21,6 +21,9 @@ import java.io.IOException; import java.util.Collection; import java.util.Collections; +import org.apache.lucene.search.intervals.ConjunctionIntervalIterator; +import org.apache.lucene.search.intervals.IntervalIterator; + /** A Scorer for queries with a required subscorer * and an excluding (prohibited) sub 
DocIdSetIterator. *
@@ -128,4 +131,9 @@ class ReqExclScorer extends Scorer { } return doc = toNonExcluded(); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return new ConjunctionIntervalIterator(this, collectIntervals, reqScorer.intervals(collectIntervals)); + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/ReqOptSumScorer.java b/lucene/core/src/java/org/apache/lucene/search/ReqOptSumScorer.java index b20dc1e..46485e6 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ReqOptSumScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ReqOptSumScorer.java @@ -16,6 +16,9 @@ package org.apache.lucene.search; * limitations under the License. */ +import org.apache.lucene.search.intervals.DisjunctionIntervalIterator; +import org.apache.lucene.search.intervals.IntervalIterator; + import java.io.IOException; import java.util.ArrayList; import java.util.Collection; @@ -86,6 +89,11 @@ class ReqOptSumScorer extends Scorer { } @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return new DisjunctionIntervalIterator(this, collectIntervals, pullIterators(collectIntervals, reqScorer, optScorer)); + } + + @Override public float freq() throws IOException { // we might have deferred advance() score(); diff --git a/lucene/core/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java b/lucene/core/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java index cabadf5..1129a03 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java @@ -17,6 +17,8 @@ package org.apache.lucene.search; * limitations under the License. 
*/ +import org.apache.lucene.search.intervals.IntervalIterator; + import java.io.IOException; import java.util.Collection; import java.util.Collections; @@ -86,6 +88,11 @@ public class ScoreCachingWrappingScorer extends Scorer { } @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return scorer.intervals(collectIntervals); + } + + @Override public Collection getChildren() { return Collections.singleton(new ChildScorer(scorer, "CACHED")); } diff --git a/lucene/core/src/java/org/apache/lucene/search/Scorer.java b/lucene/core/src/java/org/apache/lucene/search/Scorer.java index 5da508c..0e8f518 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Scorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/Scorer.java @@ -17,6 +17,8 @@ package org.apache.lucene.search; * limitations under the License. */ +import org.apache.lucene.search.intervals.IntervalIterator; + import java.io.IOException; import java.util.Collection; import java.util.Collections; @@ -62,6 +64,46 @@ public abstract class Scorer extends DocIdSetIterator { collector.collect(doc); } } + + /** + * Expert: Retrieves an {@link IntervalIterator} for this scorer allowing + * access to position and offset intervals for each matching document. The + * returned iterator is aligned with the scorer that created the iterator and + * should only be advanced to the currently matched document. + * + *

+ * Note: {@link IntervalIterator} can be pulled multiple times from a scorer. + * Yet, there should only be one iterator advanced on the same document. It is + * recommended that the caller maintains a single interval iterator per + * scorer. + *

+ * + * @param collectIntervals + * if true the {@link IntervalIterator} can we used to + * collect all individual sub-intervals this {@link IntervalIterator} + * is composed of via + * {@link IntervalIterator#collect(org.apache.lucene.search.intervals.IntervalCollector)} + * @return an {@link IntervalIterator} over matching intervals + * @throws IOException + * if a low-level I/O error is encountered + */ + public abstract IntervalIterator intervals(boolean collectIntervals) throws IOException; + + /** + * Get the IntervalIterators from a list of scorers + * @param collectIntervals true if positions will be collected + * @param scorers the list of scorers to retrieve IntervalIterators from + * @return a list of IntervalIterators pulled from the passed in Scorers + * @throws java.io.IOException if a low-evel I/O error is encountered + */ + public static IntervalIterator[] pullIterators(boolean collectIntervals, Scorer... scorers) + throws IOException { + IntervalIterator[] iterators = new IntervalIterator[scorers.length]; + for (int i = 0; i < scorers.length; i++) { + iterators[i] = scorers[i].intervals(collectIntervals); + } + return iterators; + } /** * Expert: Collects matching documents in a range. Hook for optimization. 
diff --git a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index a143c17..91c7224 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -20,17 +20,26 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.Term; +import org.apache.lucene.search.intervals.ConjunctionIntervalIterator; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.search.intervals.SloppyIntervalIterator; +import org.apache.lucene.search.intervals.TermIntervalIterator; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.OpenBitSet; final class SloppyPhraseScorer extends PhraseScorer { - +// private MaxLengthPositionIntervalIterator iter; private final int slop; private final int numPostings; private final PhraseQueue pq; // for advancing min position @@ -44,12 +53,22 @@ final class SloppyPhraseScorer extends PhraseScorer { private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - int slop, Similarity.SloppySimScorer docScorer) { + int slop, Similarity.SloppySimScorer docScorer) throws IOException { super(weight, postings, docScorer); this.slop = slop; this.numPostings = postings==null ? 
0 : postings.length; pq = new PhraseQueue(postings.length); +// iter = (MaxLengthPositionIntervalIterator) positions(false, false, false); } + +// String current() { +// StringBuilder b = new StringBuilder(); +// int i = 0; +// for (PhrasePositions phrasePositions : postings) { +// b.append(i++).append(phrasePositions).append("\n"); +// } +// return b.toString(); +// } /** * Score a candidate doc for all slop-valid position-combinations (matches) @@ -71,35 +90,74 @@ final class SloppyPhraseScorer extends PhraseScorer { */ @Override protected float phraseFreq() throws IOException { +// float freq = 0.0f; +// +// if (1 ==1) { +// if (iter.docID() != this.docID()) { +// iter.advanceTo(this.docID()); +// } +// while(iter.next() != null) { +// freq += docScorer.computeSlopFactor(iter.matchLength()); // score match +// } +// +// return freq; +// } +// freq = 0.0f; + if (!initPhrasePositions()) { return 0.0f; } + float freq = 0.0f; PhrasePositions pp = pq.pop(); int matchLength = end - pp.position; int next = pq.top().position; + +// int _lPos = pp.position; +// int _lend = end; +// String _s = current(); +// Term[] _lTerms = pp.terms; while (advancePP(pp)) { if (hasRpts && !advanceRpts(pp)) { break; // pps exhausted } if (pp.position > next) { // done minimizing current match-length if (matchLength <= slop) { +// System.out.println("match: " + _lPos + " " + _lend + " " + Arrays.toString(_lTerms) + " " + matchLength); +// System.out.println(_s); +// System.out.println( docScorer.computeSlopFactor(matchLength)); freq += docScorer.computeSlopFactor(matchLength); // score match } pq.add(pp); pp = pq.pop(); next = pq.top().position; matchLength = end - pp.position; +// _lPos = pp.position; +// _lend = end; +// _lTerms = pp.terms; +// _s = current(); } else { int matchLength2 = end - pp.position; + if (matchLength2 < matchLength) { +// _lPos = pp.position; +// _lend = end; +// _lTerms = pp.terms; +// _s = current(); matchLength = matchLength2; } } } if (matchLength <= slop) { 
+// System.out.println("match: " + _lPos + " " + _lend + " " + Arrays.toString(_lTerms) + " " + matchLength); +// System.out.println(_s); +// System.out.println( docScorer.computeSlopFactor(matchLength)); + freq += docScorer.computeSlopFactor(matchLength); // score match + } +// System.out.println("res: " + freq + " doc: " + this.docID()); + return freq; } @@ -482,26 +540,55 @@ final class SloppyPhraseScorer extends PhraseScorer { } return tg; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + Map map = new HashMap(); + List enums = new ArrayList(); + + for (int i = 0; i < postings.length; i++) { + if (postings[i].terms.length > 1) { + throw new UnsupportedOperationException("IntervalIterators for MulitPhraseQuery is not supported"); + } + Term term = postings[i].terms[0]; + IterAndOffsets iterAndOffset; + if (!map.containsKey(term)) { + DocsAndPositionsEnum docsAndPosEnum = postings[i].factory + .docsAndPositionsEnum(); + enums.add(docsAndPosEnum); + iterAndOffset = new IterAndOffsets(new TermIntervalIterator(this, docsAndPosEnum, false, + collectIntervals)); + map.put(term, iterAndOffset); + } else { + iterAndOffset = map.get(term); + } + iterAndOffset.offsets.add(postings[i].position); + } + Collection values = map.values(); + IntervalIterator[] iters = new IntervalIterator[values.size()]; + int i = 0; + for (IterAndOffsets iterAndOffsets : values) { + iters[i++] = SloppyIntervalIterator.create(this, collectIntervals, iterAndOffsets.iter, iterAndOffsets.toIntArray()); + } + return new AdvancingIntervalIterator(this, collectIntervals, enums.toArray(new DocsAndPositionsEnum[enums.size()]), new SloppyIntervalIterator(this, slop, collectIntervals, iters)); + } -// private void printQueue(PrintStream ps, PhrasePositions ext, String title) { -// //if (min.doc != ?) 
return; -// ps.println(); -// ps.println("---- "+title); -// ps.println("EXT: "+ext); -// PhrasePositions[] t = new PhrasePositions[pq.size()]; -// if (pq.size()>0) { -// t[0] = pq.pop(); -// ps.println(" " + 0 + " " + t[0]); -// for (int i=1; i=0; i--) { -// pq.add(t[i]); -// } -// } -// } + private final static class IterAndOffsets { + final List offsets = new ArrayList(); + final IntervalIterator iter; + + IterAndOffsets(IntervalIterator iter) { + this.iter = iter; + } + + int[] toIntArray() { + int[] array = new int[offsets.size()]; + for (int i = 0; i < array.length; i++) { + array[i] = offsets.get(i).intValue(); + } + return array; + } + } + } diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java index 7f854f8..dd1f13d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.Set; import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.IndexReaderContext; @@ -35,55 +36,68 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.ToStringUtils; -/** A Query that matches documents containing a term. - This may be combined with other terms with a {@link BooleanQuery}. - */ +/** + * A Query that matches documents containing a term. This may be combined with + * other terms with a {@link BooleanQuery}. 
+ */ public class TermQuery extends Query { private final Term term; private final int docFreq; private final TermContext perReaderTermState; - + final class TermWeight extends Weight { private final Similarity similarity; private final Similarity.SimWeight stats; private final TermContext termStates; - + public TermWeight(IndexSearcher searcher, TermContext termStates) - throws IOException { + throws IOException { assert termStates != null : "TermContext must not be null"; this.termStates = termStates; this.similarity = searcher.getSimilarity(); - this.stats = similarity.computeWeight( - getBoost(), - searcher.collectionStatistics(term.field()), + this.stats = similarity.computeWeight(getBoost(), + searcher.collectionStatistics(term.field()), searcher.termStatistics(term, termStates)); } - + @Override - public String toString() { return "weight(" + TermQuery.this + ")"; } - + public String toString() { + return "weight(" + TermQuery.this + ")"; + } + @Override - public Query getQuery() { return TermQuery.this; } - + public Query getQuery() { + return TermQuery.this; + } + @Override public float getValueForNormalization() { return stats.getValueForNormalization(); } - + @Override public void normalize(float queryNorm, float topLevelBoost) { stats.normalize(queryNorm, topLevelBoost); } - + @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { - assert termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context); + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { + assert termStates.topReaderContext == ReaderUtil + .getTopLevelContext(context) : "The top-reader used to create Weight (" + + termStates.topReaderContext + + ") is not the same as the current reader's 
top-reader (" + + ReaderUtil.getTopLevelContext(context); final TermsEnum termsEnum = getTermsEnum(context); if (termsEnum == null) { return null; } - DocsEnum docs = termsEnum.docs(acceptDocs, null); + DocsEnum docs; + if (!flags.isProximityFeature()) { + docs = termsEnum.docs(acceptDocs, null, flags.docFlags()); + } else { + docs = termsEnum.docsAndPositions(acceptDocs, null, flags.docsAndPositionsFlags()); + } assert docs != null; return new TermScorer(this, docs, similarity.exactSimScorer(stats, context), termsEnum.docFreq()); } @@ -98,90 +112,102 @@ public class TermQuery extends Query { assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term; return null; } - //System.out.println("LD=" + reader.getLiveDocs() + " set?=" + (reader.getLiveDocs() != null ? reader.getLiveDocs().get(0) : "null")); - final TermsEnum termsEnum = context.reader().terms(term.field()).iterator(null); + // System.out.println("LD=" + reader.getLiveDocs() + " set?=" + + // (reader.getLiveDocs() != null ? 
reader.getLiveDocs().get(0) : "null")); + final TermsEnum termsEnum = context.reader().terms(term.field()) + .iterator(null); termsEnum.seekExact(term.bytes(), state); return termsEnum; } private boolean termNotInReader(AtomicReader reader, Term term) throws IOException { // only called from assert - //System.out.println("TQ.termNotInReader reader=" + reader + " term=" + field + ":" + bytes.utf8ToString()); + // System.out.println("TQ.termNotInReader reader=" + reader + " term=" + + // field + ":" + bytes.utf8ToString()); return reader.docFreq(term) == 0; } @Override - public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - Scorer scorer = scorer(context, true, false, context.reader().getLiveDocs()); + public Explanation explain(AtomicReaderContext context, int doc) + throws IOException { + Scorer scorer = scorer(context, true, false, PostingFeatures.DOCS_AND_FREQS, context.reader() + .getLiveDocs()); if (scorer != null) { int newDoc = scorer.advance(doc); if (newDoc == doc) { float freq = scorer.freq(); ExactSimScorer docScorer = similarity.exactSimScorer(stats, context); ComplexExplanation result = new ComplexExplanation(); - result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); - Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "termFreq=" + freq)); + result.setDescription("weight(" + getQuery() + " in " + doc + ") [" + + similarity.getClass().getSimpleName() + "], result of:"); + Explanation scoreExplanation = docScorer.explain(doc, + new Explanation(freq, "termFreq=" + freq)); result.addDetail(scoreExplanation); result.setValue(scoreExplanation.getValue()); result.setMatch(true); return result; } } - return new ComplexExplanation(false, 0.0f, "no matching term"); + return new ComplexExplanation(false, 0.0f, "no matching term"); } } - + /** Constructs a query for the term t. 
*/ public TermQuery(Term t) { this(t, -1); } - - /** Expert: constructs a TermQuery that will use the - * provided docFreq instead of looking up the docFreq - * against the searcher. */ + + /** + * Expert: constructs a TermQuery that will use the provided docFreq instead + * of looking up the docFreq against the searcher. + */ public TermQuery(Term t, int docFreq) { term = t; this.docFreq = docFreq; perReaderTermState = null; } - /** Expert: constructs a TermQuery that will use the - * provided docFreq instead of looking up the docFreq - * against the searcher. */ + /** + * Expert: constructs a TermQuery that will use the provided docFreq instead + * of looking up the docFreq against the searcher. + */ public TermQuery(Term t, TermContext states) { assert states != null; term = t; docFreq = states.docFreq(); perReaderTermState = states; } - + /** Returns the term of this query. */ - public Term getTerm() { return term; } - + public Term getTerm() { + return term; + } + @Override public Weight createWeight(IndexSearcher searcher) throws IOException { final IndexReaderContext context = searcher.getTopReaderContext(); final TermContext termState; - if (perReaderTermState == null || perReaderTermState.topReaderContext != context) { - // make TermQuery single-pass if we don't have a PRTS or if the context differs! + if (perReaderTermState == null + || perReaderTermState.topReaderContext != context) { + // make TermQuery single-pass if we don't have a PRTS or if the context + // differs! termState = TermContext.build(context, term, true); // cache term lookups! 
} else { - // PRTS was pre-build for this IS - termState = this.perReaderTermState; + // PRTS was pre-build for this IS + termState = this.perReaderTermState; } - + // we must not ignore the given docFreq - if set use the given value (lie) - if (docFreq != -1) - termState.setDocFreq(docFreq); + if (docFreq != -1) termState.setDocFreq(docFreq); return new TermWeight(searcher, termState); } - + @Override public void extractTerms(Set terms) { terms.add(getTerm()); } - + /** Prints a user-readable version of this query. */ @Override public String toString(String field) { @@ -194,21 +220,20 @@ public class TermQuery extends Query { buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } - + /** Returns true iff o is equal to this. */ @Override public boolean equals(Object o) { - if (!(o instanceof TermQuery)) - return false; - TermQuery other = (TermQuery)o; + if (!(o instanceof TermQuery)) return false; + TermQuery other = (TermQuery) o; return (this.getBoost() == other.getBoost()) - && this.term.equals(other.term); + && this.term.equals(other.term); } - - /** Returns a hash code value for this object.*/ + + /** Returns a hash code value for this object. 
*/ @Override public int hashCode() { return Float.floatToIntBits(getBoost()) ^ term.hashCode(); } - + } diff --git a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java index 3aff7f1..b9e36ac 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java @@ -19,7 +19,10 @@ package org.apache.lucene.search; import java.io.IOException; +import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.search.intervals.TermIntervalIterator; import org.apache.lucene.search.similarities.Similarity; /** Expert: A Scorer for documents matching a Term. @@ -92,7 +95,12 @@ final class TermScorer extends Scorer { /** Returns a string representation of this TermScorer. */ @Override public String toString() { return "scorer(" + weight + ")"; } - + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + assert docsEnum instanceof DocsAndPositionsEnum; + return new TermIntervalIterator(this, (DocsAndPositionsEnum) docsEnum, false, collectIntervals); + } // TODO: benchmark if the specialized conjunction really benefits // from this, or if instead its from sorting by docFreq, or both diff --git a/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollector.java b/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollector.java index 1704d8b..a11ba25 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/TotalHitCountCollector.java @@ -18,6 +18,7 @@ package org.apache.lucene.search; */ import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.search.Weight.PostingFeatures; /** * Just counts the total number of hits. 
@@ -41,6 +42,12 @@ public class TotalHitCountCollector extends Collector { } @Override + public PostingFeatures postingFeatures() { + // we don't need frequencies here + return PostingFeatures.DOCS_ONLY; + } + + @Override public void setNextReader(AtomicReaderContext context) { } diff --git a/lucene/core/src/java/org/apache/lucene/search/Weight.java b/lucene/core/src/java/org/apache/lucene/search/Weight.java index 48dd209..42d3f13 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Weight.java +++ b/lucene/core/src/java/org/apache/lucene/search/Weight.java @@ -21,7 +21,10 @@ import java.io.IOException; import org.apache.lucene.index.AtomicReader; // javadocs import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.IndexReaderContext; // javadocs +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.Bits; @@ -36,7 +39,7 @@ import org.apache.lucene.util.Bits; *

* Since {@link Weight} creates {@link Scorer} instances for a given * {@link AtomicReaderContext} ({@link #scorer(AtomicReaderContext, - * boolean, boolean, Bits)}) + * boolean, boolean, PostingFeatures, Bits)}) * callers must maintain the relationship between the searcher's top-level * {@link IndexReaderContext} and the context used to create a {@link Scorer}. *

@@ -51,7 +54,7 @@ import org.apache.lucene.util.Bits; *

  • The query normalization factor is passed to {@link #normalize(float, float)}. At * this point the weighting is complete. *
  • A Scorer is constructed by - * {@link #scorer(AtomicReaderContext, boolean, boolean, Bits)}. + * {@link #scorer(AtomicReaderContext, boolean, boolean, PostingFeatures, Bits)}. * * * @since 2.9 @@ -103,21 +106,21 @@ public abstract class Weight { * if true, {@link Scorer#score(Collector)} will be called; if false, * {@link Scorer#nextDoc()} and/or {@link Scorer#advance(int)} will * be called. + * @param flags the low level {@link PostingFeatures} for this scorer. * @param acceptDocs * Bits that represent the allowable docs to match (typically deleted docs * but possibly filtering other documents) - * * @return a {@link Scorer} which scores documents in/out-of order. * @throws IOException if there is a low-level I/O error */ public abstract Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException; + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException; /** * Returns true iff this implementation scores docs only out of order. This * method is used in conjunction with {@link Collector}'s * {@link Collector#acceptsDocsOutOfOrder() acceptsDocsOutOfOrder} and - * {@link #scorer(AtomicReaderContext, boolean, boolean, Bits)} to + * {@link #scorer(AtomicReaderContext, boolean, boolean, PostingFeatures, Bits)} to * create a matching {@link Scorer} instance for a given {@link Collector}, or * vice versa. *

    @@ -125,4 +128,66 @@ public abstract class Weight { * the Scorer scores documents in-order. */ public boolean scoresDocsOutOfOrder() { return false; } + + /** + * Feature flags used to control low-level posting list features. These flags + * all Collectors and scorers to specify their requirements for document + * collection and scoring ahead of time for best performance. + */ + public static enum PostingFeatures { + /**Only document IDs are required for document collection and scoring*/ + DOCS_ONLY(0, 0, false), + /**Document IDs and Term Frequencies are required for document collection and scoring*/ + DOCS_AND_FREQS(DocsEnum.FLAG_FREQS, 0, false), + /**Document IDs, Term Frequencies and Positions are required for document collection and scoring*/ + POSITIONS(DocsEnum.FLAG_FREQS, 0, true), + /**Document IDs, Term Frequencies, Positions and Payloads are required for document collection and scoring*/ + POSITIONS_AND_PAYLOADS(DocsEnum.FLAG_FREQS, DocsAndPositionsEnum.FLAG_PAYLOADS, true), + /**Document IDs, Term Frequencies, Positions and Offsets are required for document collection and scoring*/ + OFFSETS(DocsEnum.FLAG_FREQS, DocsAndPositionsEnum.FLAG_OFFSETS, true), + /**Document IDs, Term Frequencies, Positions, Offsets and Payloads are required for document collection and scoring*/ + OFFSETS_AND_PAYLOADS(DocsEnum.FLAG_FREQS, DocsAndPositionsEnum.FLAG_OFFSETS + | DocsAndPositionsEnum.FLAG_PAYLOADS, true); + + private final int docsAndPositionsFlags; + private final int docFlags; + private final boolean isProximityFeature; + + private PostingFeatures(int docFlags, int docsAndPositionsFlags, boolean isProximityFeature) { + this.docsAndPositionsFlags = docsAndPositionsFlags; + this.docFlags = docFlags; + this.isProximityFeature = isProximityFeature; + } + + /** + * Returns the flags for {@link DocsAndPositionsEnum}. 
This value should be + * passed to + * {@link TermsEnum#docsAndPositions(Bits, DocsAndPositionsEnum, int)} + * + * @return {@link DocsAndPositionsEnum} flags + */ + public int docsAndPositionsFlags() { + return docsAndPositionsFlags; + } + + /** + * Returns the flags for {@link DocsEnum}. This value should be + * passed to + * {@link TermsEnum#docs(Bits, DocsEnum, int)} + * + * @return {@link DocsEnum} flags + */ + public int docFlags() { + return docFlags; + } + + /** + * Returns true iff the current flags set requires positions + * ie. a {@link DocsAndPositionsEnum}. + */ + public boolean isProximityFeature() { + return isProximityFeature; + } + + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/BlockIntervalIterator.java b/lucene/core/src/java/org/apache/lucene/search/intervals/BlockIntervalIterator.java new file mode 100644 index 0000000..387ca74 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/BlockIntervalIterator.java @@ -0,0 +1,173 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import org.apache.lucene.search.Scorer; + +import java.io.IOException; +import java.util.Arrays; + +/** + * An IntervalIterator implementing minimum interval semantics for the + * BLOCK operator + * + * See "Efficient Optimally Lazy Algorithms for Minimal-Interval Semantic + * + * @lucene.experimental + */ +public final class BlockIntervalIterator extends IntervalIterator { + private final IntervalIterator[] iterators; + + private static final Interval INFINITE_INTERVAL = new Interval( + Integer.MIN_VALUE, Integer.MIN_VALUE, -1, -1); + private final Interval[] intervals; + private final Interval interval = new Interval( + Integer.MIN_VALUE, Integer.MIN_VALUE, -1, -1); + private final int[] gaps; + + private final int lastIter; + + /** + * Construct a BlockIntervalIterator over a compound IntervalIterator. The + * sub-iterators must be in order and sequential for a match. + * @param collectIntervals true if intervals will be collected + * @param other the compound {@link IntervalIterator} used to extract the individual block iterators + */ + public BlockIntervalIterator(boolean collectIntervals, IntervalIterator other) { + this(collectIntervals, defaultIncrements(other.subs(true).length), other); + } + + /** + * Construct a BlockIntervalIterator over a compound IntervalIterator using + * a supplied increments array. 
+ * @param collectIntervals true if intervals will be collected + * @param increments an array of position increments between the iterators + * @param other the compound {@link IntervalIterator} used to extract the individual block iterators + */ + public BlockIntervalIterator(boolean collectIntervals, int[] increments, IntervalIterator other) { + super(other.getScorer(), collectIntervals); + assert other.subs(true) != null; + iterators = other.subs(true); + assert iterators.length > 1; + intervals = new Interval[iterators.length]; + lastIter = iterators.length - 1; + this.gaps = increments; + } + + /** + * Construct a BlockIntervalIterator over a set of subiterators using a supplied + * increments array + * @param scorer the parent Scorer + * @param increments an array of position increments between the iterators + * @param collectIntervals true if intervals will be collected + * @param iterators the subiterators + */ + public BlockIntervalIterator(Scorer scorer, int[] increments, boolean collectIntervals, + IntervalIterator... iterators) { + super(scorer, collectIntervals); + assert iterators.length > 1; + this.iterators = iterators; + intervals = new Interval[iterators.length]; + lastIter = iterators.length - 1; + this.gaps = increments; + } + + /** + * Construct a BlockIntervalIterator over a set of subiterators + * @param scorer the parent Scorer + * @param collectIntervals true if intervals will be collected + * @param iterators the subiterators + */ + public BlockIntervalIterator(Scorer scorer, boolean collectIntervals, IntervalIterator... 
iterators) { + this(scorer, defaultIncrements(iterators.length), collectIntervals, iterators); + } + + private static int[] defaultIncrements(int num) { + int[] gaps = new int[num]; + Arrays.fill(gaps, 1); + return gaps; + } + + @Override + public Interval next() throws IOException { + if ((intervals[0] = iterators[0].next()) == null) { + return null; + } + int offset = 0; + for (int i = 1; i < iterators.length;) { + final int gap = gaps[i]; + while (intervals[i].begin + gap <= intervals[i - 1].end) { + if ((intervals[i] = iterators[i].next()) == null) { + return null; + } + } + offset += gap; + if (intervals[i].begin == intervals[i - 1].end + gaps[i]) { + i++; + if (i < iterators.length && intervals[i] == INFINITE_INTERVAL) { + // advance only if really necessary + iterators[i].scorerAdvanced(docID()); + assert iterators[i].docID() == docID(); + } + } else { + do { + if ((intervals[0] = iterators[0].next()) == null) { + return null; + } + } while (intervals[0].begin < intervals[i].end - offset); + + i = 1; + } + } + interval.begin = intervals[0].begin; + interval.end = intervals[lastIter].end; + interval.offsetBegin = intervals[0].offsetBegin; + interval.offsetEnd = intervals[lastIter].offsetEnd; + return interval; + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return iterators; + } + + @Override + public void collect(IntervalCollector collector) { + assert collectIntervals; + collector.collectComposite(scorer, interval, docID()); + for (IntervalIterator iter : iterators) { + iter.collect(collector); + } + } + + @Override + public int scorerAdvanced(int docId) throws IOException { + iterators[0].scorerAdvanced(docId); + assert iterators[0].docID() == docId; + iterators[1].scorerAdvanced(docId); + assert iterators[1].docID() == docId; + Arrays.fill(intervals, INFINITE_INTERVAL); + return docId; + } + + @Override + public int matchDistance() { + return intervals[lastIter].begin - intervals[0].end; + } +} diff --git 
a/lucene/core/src/java/org/apache/lucene/search/intervals/BrouwerianIntervalIterator.java b/lucene/core/src/java/org/apache/lucene/search/intervals/BrouwerianIntervalIterator.java new file mode 100644 index 0000000..e954108 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/BrouwerianIntervalIterator.java @@ -0,0 +1,98 @@ +package org.apache.lucene.search.intervals; + +import org.apache.lucene.search.Scorer; + +import java.io.IOException; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * IntervalIterator based on minimum interval semantics for the Brouwerian + * operator. This {@link IntervalIterator} computes the different M-S + * between the anti-chains M (minuend) and S (subtracted). + *

    + * + * + * See "Efficient Optimally Lazy Algorithms for Minimal-Interval Semantic + */ +public class BrouwerianIntervalIterator extends IntervalIterator { + + private final IntervalIterator minuend; + private final IntervalIterator subtracted; + private Interval subtractedInterval = new Interval(); + private Interval currentInterval = new Interval(); + + /** + * Construct a new BrouwerianIntervalIterator over a minuend and a subtrahend + * IntervalIterator + * @param scorer the parent Scorer + * @param collectIntervals true if intervals will be collected + * @param minuend the minuend IntervalIterator + * @param subtracted the subtrahend IntervalIterator + */ + public BrouwerianIntervalIterator(Scorer scorer, boolean collectIntervals, IntervalIterator minuend, IntervalIterator subtracted) { + super(scorer, collectIntervals); + this.minuend = minuend; + this.subtracted = subtracted; + } + + @Override + public int scorerAdvanced(int docId) throws IOException { + subtractedInterval.reset(); + minuend.scorerAdvanced(docId); + subtracted.scorerAdvanced(docId); + return docId; + } + + @Override + public Interval next() throws IOException { + if (subtracted.docID() != minuend.docID()) { + return currentInterval = minuend.next(); + } + while ((currentInterval = minuend.next()) != null) { + while(subtractedInterval.lessThan(currentInterval) && (subtractedInterval = subtracted.next()) != null) { + } + if (subtractedInterval == null || subtractedInterval.greaterThan(currentInterval)) { + return currentInterval; + } + } + return currentInterval; + } + + @Override + public void collect(IntervalCollector collector) { + assert collectIntervals; + collector.collectComposite(scorer, currentInterval, docID()); + minuend.collect(collector); + + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return new IntervalIterator[] {minuend, subtracted}; + } + + + @Override + public int matchDistance() { + return minuend.matchDistance(); + } + +} diff --git 
a/lucene/core/src/java/org/apache/lucene/search/intervals/ConjunctionIntervalIterator.java b/lucene/core/src/java/org/apache/lucene/search/intervals/ConjunctionIntervalIterator.java new file mode 100644 index 0000000..06ad314 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/ConjunctionIntervalIterator.java @@ -0,0 +1,172 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.intervals.IntervalQueue.IntervalRef; + +import java.io.IOException; + +/** + * ConjuctionIntervalIterator based on minimal interval semantics for AND + * operator. 
+ * + * See "Efficient Optimally Lazy Algorithms for Minimal-Interval Semantic + * + * @lucene.experimental + */ +public final class ConjunctionIntervalIterator extends IntervalIterator { + + private final IntervalQueueAnd queue; + private final int nrMustMatch; + private SnapshotPositionCollector snapshot; + private final IntervalIterator[] iterators; + private int rightExtremeBegin; + + + /** + * Create a new ConjunctionIntervalIterator over a set of subiterators + * @param scorer the parent scorer + * @param collectIntervals true if intervals will be collected + * @param iterators a list of iterators to combine + * @throws IOException if a low level I/O exception occurs + */ + public ConjunctionIntervalIterator(Scorer scorer, boolean collectIntervals, + IntervalIterator... iterators) throws IOException { + this(scorer, collectIntervals, iterators.length, iterators); + } + + /** + * Create a new ConjunctionIntervalIterator over a set of subiterators, + * with a minimum number of matching subiterators per document + * @param scorer the parent Scorer + * @param collectIntervals true if intervals will be collected + * @param minimuNumShouldMatch the number of subiterators that should + * match a document for a match to be returned + * @param iterators a list of iterators to combine + * @throws IOException if an low level I/O exception occurs + */ + public ConjunctionIntervalIterator(Scorer scorer, boolean collectIntervals, + int minimuNumShouldMatch, IntervalIterator... 
iterators) + throws IOException { + super(scorer, collectIntervals); + this.iterators = iterators; + this.queue = new IntervalQueueAnd(iterators.length); + this.nrMustMatch = minimuNumShouldMatch; + } + + private void advance() throws IOException { + final IntervalRef top = queue.top(); + Interval interval = null; + if ((interval = iterators[top.index].next()) != null) { + top.interval = interval; + queue.updateRightExtreme(top); + queue.updateTop(); + } else { + queue.pop(); + } + } + + @Override + public Interval next() throws IOException { + + while (queue.size() >= nrMustMatch + && queue.top().interval.begin == queue.currentCandidate.begin) { + advance(); + } + if (queue.size() < nrMustMatch) { + return null; + } + do { + queue.updateCurrentCandidate(); + Interval top = queue.top().interval; + if (collectIntervals) { + snapShotSubPositions(); // this looks odd? -> see SnapShotCollector below for + // details! + } + if (queue.currentCandidate.begin == top.begin + && queue.currentCandidate.end == top.end) { + return queue.currentCandidate; + } + rightExtremeBegin = queue.rightExtremeBegin; + advance(); + } while (queue.size() >= nrMustMatch && queue.currentCandidate.end == queue.rightExtreme); + return queue.currentCandidate; // TODO support payloads + } + + + @Override + public int scorerAdvanced(final int docId) throws IOException { + if (docId == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + queue.reset(); + for (int i = 0; i < iterators.length; i++) { + int scorerAdvanced = iterators[i].scorerAdvanced(docId); + assert scorerAdvanced == docId; + final Interval interval = iterators[i].next(); + if (interval != null) { + IntervalRef intervalRef = new IntervalRef(interval, i); // TODO maybe + // reuse? 
+ queue.updateRightExtreme(intervalRef); + queue.add(intervalRef); + } + } + return docId; + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return iterators; + } + + + private void snapShotSubPositions() { + if (snapshot == null) { + snapshot = new SnapshotPositionCollector(queue.size()); + } + snapshot.reset(); + collectInternal(snapshot); + } + + private void collectInternal(IntervalCollector collector) { + assert collectIntervals; + collector.collectComposite(scorer, queue.currentCandidate, docID()); + for (IntervalIterator iter : iterators) { + iter.collect(collector); + } + + } + + @Override + public void collect(IntervalCollector collector) { + assert collectIntervals; + if (snapshot == null) { + // we might not be initialized if the first interval matches + collectInternal(collector); + } else { + snapshot.replay(collector); + } + } + + @Override + public int matchDistance() { + return (rightExtremeBegin) - (queue.currentTopEnd) -1; // align the match if pos are adjacent + } +} \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalIterator.java b/lucene/core/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalIterator.java new file mode 100644 index 0000000..4c8a9ec --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/DisjunctionIntervalIterator.java @@ -0,0 +1,107 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * IntervalIterator based on minimal interval semantics for the OR operator:
 * on each document, merges the intervals of all matching sub-iterators.
 *
 * See "Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics"
 * (Boldi &amp; Vigna) for the underlying algorithm.
 *
 * @lucene.experimental
 */
public final class DisjunctionIntervalIterator extends IntervalIterator {

  // Priority queue ordered by IntervalQueueOr.lessThan (smallest end first).
  private final IntervalQueue queue;
  private final IntervalIterator[] iterators;

  /**
   * Creates a new DisjunctionIntervalIterator over a set of IntervalIterators
   * @param scorer the parent Scorer
   * @param collectIntervals true if intervals will be collected
   * @param intervals the IntervalIterators to iterate over
   * @throws IOException if a low-level I/O error is encountered
   */
  public DisjunctionIntervalIterator(Scorer scorer, boolean collectIntervals, IntervalIterator... intervals)
      throws IOException {
    super(scorer, collectIntervals);
    this.iterators = intervals;
    queue = new IntervalQueueOr(intervals.length);
  }

  // Advance the sub-iterator at the queue head to its next interval,
  // popping it when it is exhausted on the current document.
  private void advance() throws IOException {
    final IntervalRef top = queue.top();
    Interval interval = null;
    if ((interval = iterators[top.index].next()) != null) {
      top.interval = interval;
      queue.updateTop();
    } else {
      queue.pop();
    }
  }

  @Override
  public Interval next() throws IOException {
    // Discard heads that do not start strictly after the previous candidate.
    while (queue.size() > 0 && queue.top().interval.begin <= queue.currentCandidate.begin) {
      advance();
    }
    if (queue.size() == 0) {
      return null;
    }
    queue.updateCurrentCandidate();
    return queue.currentCandidate; // TODO support payloads
  }

  @Override
  public IntervalIterator[] subs(boolean inOrder) {
    return iterators;
  }

  @Override
  public void collect(IntervalCollector collector) {
    assert collectIntervals;
    collector.collectComposite(scorer, queue.currentCandidate, docID());
    // Only the sub-iterator that produced the current candidate contributes.
    iterators[queue.top().index].collect(collector);
  }

  @Override
  public int scorerAdvanced(int docId) throws IOException {
    queue.reset();
    for (int i = 0; i < iterators.length; i++) {
      int scorerAdvanced = iterators[i].scorerAdvanced(docId);
      assert iterators[i].docID() == scorerAdvanced : " " + iterators[i];

      if (scorerAdvanced == docId) {
        // NOTE(review): assumes next() is non-null for a sub-iterator that
        // landed on docId; a null here would poison the queue comparator —
        // TODO confirm against IntervalIterator's contract.
        queue.add(new IntervalRef(iterators[i].next(), i));
      }
    }
    return this.docID();
  }

  @Override
  public int matchDistance() {
    return iterators[queue.top().index].matchDistance();
  }

}
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Represents a section of a document that matches a query + */ +public class Interval implements Cloneable { + + /** The position of the start of this Interval */ + public int begin; + + /** The position of the end of this Interval */ + public int end; + + /** The offset of the start of this Interval */ + public int offsetBegin; + + /** The offset of the end of this Interval */ + public int offsetEnd; + + /** An interval that will always compare as less than any other interval */ + public static final Interval INFINITE_INTERVAL = new Interval(); + + /** + * Constructs a new Interval + * @param begin the start position + * @param end the end position + * @param offsetBegin the start offset + * @param offsetEnd the end offset + */ + public Interval(int begin, int end, int offsetBegin, int offsetEnd) { + this.begin = begin; + this.end = end; + this.offsetBegin = offsetBegin; + this.offsetEnd = offsetEnd; + } + + /** + * Constructs a new Interval with no initial values. This + * will always compare as less than any other Interval. + */ + public Interval() { + this(Integer.MIN_VALUE, Integer.MIN_VALUE, -1, -1); + } + + /** + * Update to span the range defined by two other Intervals. 
+ * @param start the first Interval + * @param end the second Interval + */ + public void update(Interval start, Interval end) { + this.begin = start.begin; + this.offsetBegin = start.offsetBegin; + this.end = end.end; + this.offsetEnd = end.offsetEnd; + } + + /** + * Compare with another Interval. + * @param other the comparator + * @return true if both start and end positions are less than + * the comparator. + */ + public boolean lessThanExclusive(Interval other) { + return begin < other.begin && end < other.end; + } + + /** + * Compare with another Interval. + * @param other the comparator + * @return true if both start and end positions are less than + * or equal to the comparator's. + */ + public boolean lessThan(Interval other) { + return begin <= other.begin && end <= other.end; + } + + /** + * Compare with another Interval + * @param other the comparator + * @return true if both start and end positions are greater then + * the comparator's. + */ + public boolean greaterThanExclusive(Interval other) { + return begin > other.begin && end > other.end; + } + + /** + * Compare with another Interval + * @param other the comparator + * @return true if both start and end positions are greater then + * of equal to the comparator's. + */ + public boolean greaterThan(Interval other) { + return begin >= other.begin && end >= other.end; + } + + /** + * Compare with another Interval + * @param other the comparator + * @return true if this Interval contains the comparator + */ + public boolean contains(Interval other) { + return begin <= other.begin && other.end <= end; + } + + /** + * Set all values of this Interval to be equal to another's + * @param other the Interval to copy + */ + public void copy(Interval other) { + begin = other.begin; + end = other.end; + offsetBegin = other.offsetBegin; + offsetEnd = other.offsetEnd; + } + + /** + * Set to a state that will always compare as less than any + * other Interval. 
+ */ + public void reset() { + offsetBegin = offsetEnd = -1; + begin = end = Integer.MIN_VALUE; + } + + /** + * Set to a state that will always compare as more than any + * other Interval. + */ + public void setMaximum() { + offsetBegin = offsetEnd = -1; + begin = end = Integer.MAX_VALUE; + } + + @Override + public Object clone() { + try { + return super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException(); // should not happen + } + } + + @Override + public String toString() { + return "Interval [begin=" + begin + "(" + offsetBegin + "), end=" + + end + "(" + offsetEnd + ")]"; + } + +} \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalCollector.java b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalCollector.java new file mode 100644 index 0000000..9ddc3f8 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalCollector.java @@ -0,0 +1,43 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.search.Scorer; + +/** + * Used for collecting matching {@link Interval}s from a search + */ +public interface IntervalCollector { + + /** + * Collects an individual term match + * @param scorer the parent scorer + * @param interval the interval to collect + * @param docID the docID of the document matched + */ + public void collectLeafPosition(Scorer scorer, Interval interval, int docID); + + /** + * Collects a composite interval that may have sub-intervals + * @param scorer the parent scorer + * @param interval the interval to collect + * @param docID the docID of the document matched + */ + public void collectComposite(Scorer scorer, Interval interval, int docID); + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalFilter.java b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalFilter.java new file mode 100644 index 0000000..9e8531a --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalFilter.java @@ -0,0 +1,36 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Filters an {@link IntervalIterator} + * + * @see IntervalFilterQuery + */ +public interface IntervalFilter { + + /** + * Filter the passed in IntervalIterator + * @param collectIntervals true if the returned {@link IntervalIterator} will + * be passed to an {@link IntervalCollector} + * @param iter the {@link IntervalIterator} to filter + * @return a filtered {@link IntervalIterator} + */ + public abstract IntervalIterator filter(boolean collectIntervals, IntervalIterator iter); + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalFilterQuery.java b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalFilterQuery.java new file mode 100644 index 0000000..d205cd3 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalFilterQuery.java @@ -0,0 +1,318 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * A Query that filters the results of an inner {@link Query} using an
 * {@link IntervalFilter}: a document matches only if at least one of the
 * inner query's intervals survives the filter.
 *
 * @see OrderedNearQuery
 * @see UnorderedNearQuery
 * @see NonOverlappingQuery
 *
 * @lucene.experimental
 */
public class IntervalFilterQuery extends Query implements Cloneable {

  // Not final: rewrite() replaces it on a clone.
  private Query inner;
  private final IntervalFilter filter;

  /**
   * Constructs a query using an inner query and an IntervalFilter
   * @param inner the query to wrap
   * @param filter the filter to restrict results by
   */
  public IntervalFilterQuery(Query inner, IntervalFilter filter) {
    this.inner = inner;
    this.filter = filter;
  }

  @Override
  public void extractTerms(Set terms) {
    inner.extractTerms(terms);
  }

  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    IntervalFilterQuery clone = null;

    // Clone only if the inner query actually rewrote, keeping this
    // instance immutable from the caller's point of view.
    Query rewritten = inner.rewrite(reader);
    if (rewritten != inner) {
      clone = (IntervalFilterQuery) this.clone();
      clone.inner = rewritten;
    }

    if (clone != null) {
      return clone; // some clauses rewrote
    } else {
      return this; // no clauses rewrote
    }
  }

  @Override
  public Weight createWeight(IndexSearcher searcher) throws IOException {
    return new PositionFilterWeight(inner.createWeight(searcher));
  }

  /** Weight that delegates to the inner query's weight but wraps its scorer. */
  class PositionFilterWeight extends Weight {

    private final Weight other;

    public PositionFilterWeight(Weight other) {
      this.other = other;
    }

    @Override
    public Explanation explain(AtomicReaderContext context, int doc)
        throws IOException {
      // Positions are required here so the interval filter can run.
      Scorer scorer = scorer(context, true, false, PostingFeatures.POSITIONS,
          context.reader().getLiveDocs());
      if (scorer != null) {
        int newDoc = scorer.advance(doc);
        if (newDoc == doc) {
          return other.explain(context, doc);
        }
      }
      return new ComplexExplanation(false, 0.0f,
          "No matching term within position filter");
    }

    @Override
    public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder,
        boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException {
      // Upgrade a docs+freqs request to positions: intervals need them.
      flags = flags == PostingFeatures.DOCS_AND_FREQS ? PostingFeatures.POSITIONS : flags;
      ScorerFactory factory = new ScorerFactory(other, context, topScorer, flags, acceptDocs);
      final Scorer scorer = factory.scorer();
      return scorer == null ? null : new PositionFilterScorer(this, scorer, factory);
    }

    @Override
    public Query getQuery() {
      return IntervalFilterQuery.this;
    }

    @Override
    public float getValueForNormalization() throws IOException {
      return other.getValueForNormalization();
    }

    @Override
    public void normalize(float norm, float topLevelBoost) {
      other.normalize(norm, topLevelBoost);
    }
  }

  /**
   * Captures the arguments needed to create a fresh scorer, so that
   * PositionFilterScorer.intervals(true) can build a second, independent
   * scorer for collecting intervals.
   */
  static class ScorerFactory {
    final Weight weight;
    final AtomicReaderContext context;
    final boolean topScorer;
    final PostingFeatures flags;
    final Bits acceptDocs;
    ScorerFactory(Weight weight,
        AtomicReaderContext context, boolean topScorer, PostingFeatures flags,
        Bits acceptDocs) {
      this.weight = weight;
      this.context = context;
      this.topScorer = topScorer;
      this.flags = flags;
      this.acceptDocs = acceptDocs;
    }

    public Scorer scorer() throws IOException {
      return weight.scorer(context, true, topScorer, flags, acceptDocs);
    }

  }

  /** Scorer that only accepts documents with at least one filtered interval. */
  final class PositionFilterScorer extends Scorer {
    private final Scorer other;
    private IntervalIterator filter;
    // First filtered interval of the current document, buffered by
    // nextDoc()/advance() so intervals() can replay it.
    private Interval current;
    private final ScorerFactory factory;
    public PositionFilterScorer(Weight weight, Scorer other, ScorerFactory factory) throws IOException {
      super(weight);
      this.other = other;
      this.factory = factory;
      this.filter = IntervalFilterQuery.this.filter.filter(false, other.intervals(false));
    }

    @Override
    public float score() throws IOException {
      return other.score();
    }

    @Override
    public IntervalIterator intervals(boolean collectIntervals) throws IOException {
      if (collectIntervals) {
        // Collecting mode needs its own scorer/iterator pair so it does not
        // disturb this scorer's document iteration.
        final Scorer collectingScorer = factory.scorer();
        final IntervalIterator filter = IntervalFilterQuery.this.filter.filter(true, collectingScorer.intervals(true));
        return new IntervalIterator(this, true) {

          @Override
          public int scorerAdvanced(int docId) throws IOException {
            docId = collectingScorer.advance(docId);
            filter.scorerAdvanced(docId);
            return docId;
          }

          @Override
          public Interval next() throws IOException {
            return filter.next();
          }

          @Override
          public void collect(IntervalCollector collector) {
            filter.collect(collector);
          }

          @Override
          public IntervalIterator[] subs(boolean inOrder) {
            return filter.subs(inOrder);
          }

          @Override
          public int matchDistance() {
            return filter.matchDistance();
          }

        };
      }

      // Non-collecting mode: replay the interval buffered in `current`
      // first, then continue pulling from the shared filter iterator.
      return new IntervalIterator(this, collectIntervals) {
        private boolean buffered = true;
        @Override
        public int scorerAdvanced(int docId) throws IOException {
          buffered = true;
          assert docId == filter.docID();
          return docId;
        }

        @Override
        public Interval next() throws IOException {
          if (buffered) {
            buffered = false;
            return current;
          }
          else if (current != null) {
            return current = filter.next();
          }
          return null;
        }

        @Override
        public void collect(IntervalCollector collector) {
          filter.collect(collector);
        }

        @Override
        public IntervalIterator[] subs(boolean inOrder) {
          return filter.subs(inOrder);
        }

        @Override
        public int matchDistance() {
          return filter.matchDistance();
        }

      };
    }

    @Override
    public int docID() {
      return other.docID();
    }

    @Override
    public int nextDoc() throws IOException {
      int docId = -1;
      while ((docId = other.nextDoc()) != Scorer.NO_MORE_DOCS) {
        filter.scorerAdvanced(docId);
        if ((current = filter.next()) != null) { // just check if there is at least one interval that matches!
          return other.docID();
        }
      }
      return Scorer.NO_MORE_DOCS;
    }

    @Override
    public int advance(int target) throws IOException {
      int docId = other.advance(target);
      if (docId == Scorer.NO_MORE_DOCS) {
        return NO_MORE_DOCS;
      }
      do {
        filter.scorerAdvanced(docId);
        if ((current = filter.next()) != null) {
          return other.docID();
        }
      } while ((docId = other.nextDoc()) != Scorer.NO_MORE_DOCS);
      return NO_MORE_DOCS;
    }

    @Override
    public float freq() throws IOException {
      return other.freq();
    }

  }

  @Override
  public String toString(String field) {
    return filter.toString() + "(" + inner.toString() + ")";
  }

  @Override
  public int hashCode() {
    final int prime = 31;
    int result = super.hashCode();
    result = prime * result + ((filter == null) ? 0 : filter.hashCode());
    result = prime * result + ((inner == null) ? 0 : inner.hashCode());
    return result;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) return true;
    if (!super.equals(obj)) return false;
    if (getClass() != obj.getClass()) return false;
    IntervalFilterQuery other = (IntervalFilterQuery) obj;
    if (filter == null) {
      if (other.filter != null) return false;
    } else if (!filter.equals(other.filter)) return false;
    if (inner == null) {
      if (other.inner != null) return false;
    } else if (!inner.equals(other.inner)) return false;
    return true;
  }

}
+ */ +import org.apache.lucene.search.Scorer; + +import java.io.IOException; + +/** + * Iterator over the matching {@link Interval}s of a {@link Scorer} + * + * @lucene.experimental + */ +public abstract class IntervalIterator { + + /** An empty array of IntervalIterators */ + public static final IntervalIterator[] EMPTY = new IntervalIterator[0]; + + /** An IntervalIterator containing no further Intervals */ + public static final IntervalIterator NO_MORE_POSITIONS = new EmptyIntervalIterator(); + + /** Integer representing no more documents */ + public static final int NO_MORE_DOCS = Integer.MAX_VALUE; + + protected final Scorer scorer; + protected final boolean collectIntervals; + + /** + * Constructs an IntervalIterator over a {@link Scorer} + * @param scorer the {@link Scorer} to pull positions from + * @param collectIntervals true if positions will be collected + */ + public IntervalIterator(Scorer scorer, boolean collectIntervals) { + this.scorer = scorer; + this.collectIntervals = collectIntervals; + } + + /** + * Called after the parent scorer has been advanced. If the scorer is + * currently positioned on docId, then subsequent calls to next() will + * return Intervals for that document; otherwise, no Intervals are + * available + * @param docId the document the parent scorer was advanced to + * @return the docId that the scorer is currently positioned at + * @throws IOException if a low-level I/O error is encountered + */ + public abstract int scorerAdvanced(int docId) throws IOException; + + /** + * Get the next Interval on the current document. + * @return the next Interval, or null if there are no remaining Intervals + * @throws IOException if a low-level I/O error is encountered + */ + public abstract Interval next() throws IOException; + + /** + * If intervals are to be collected, this will be called once + * for each Interval returned by the iterator. The constructor + * must have been called with collectIntervals=true. 
+ * @param collector an {@link IntervalCollector} to collect the + * Interval positions + * @see Scorer#intervals(boolean) + */ + public abstract void collect(IntervalCollector collector); + + /** + * Get any sub-iterators + * + * @param inOrder + * true if the sub-iterators should be returned in the same order the + * queries were provided + */ + public abstract IntervalIterator[] subs(boolean inOrder); + + /** + * Get the distance between matching subintervals + */ + public abstract int matchDistance(); + + /** + * Get the current docID + */ + public int docID() { + return scorer.docID(); + } + + /** + * Get this iterator's {@link Scorer} + */ + public Scorer getScorer() { + return scorer; + } + + /** + * An iterator that is always exhausted + */ + private static final class EmptyIntervalIterator extends + IntervalIterator { + + public EmptyIntervalIterator() { + super(null, false); + } + + @Override + public int scorerAdvanced(int docId) throws IOException { + return IntervalIterator.NO_MORE_DOCS; + } + + @Override + public Interval next() throws IOException { + return null; + } + + @Override + public void collect(IntervalCollector collectoc) {} + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return EMPTY; + } + + @Override + public int matchDistance() { + return Integer.MAX_VALUE; + } + + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalQueue.java b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalQueue.java new file mode 100644 index 0000000..62b150f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalQueue.java @@ -0,0 +1,71 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.lucene.search.intervals.IntervalQueue.IntervalRef; +import org.apache.lucene.util.PriorityQueue; + +/** + * Abstract base class for calculating minimal spanning intervals with Queues. + * @see IntervalQueueAnd + * + * @lucene.experimental + * @lucene.internal + */ +abstract class IntervalQueue extends PriorityQueue { + /** + * The current interval spanning the queue + */ + final Interval currentCandidate = new Interval( + Integer.MIN_VALUE, Integer.MIN_VALUE, -1, -1); + + /** + * Creates a new {@link IntervalQueue} with a fixed size + * @param size the size of the queue + */ + public IntervalQueue(int size) { + super(size); + } + + /** + * Clears and resets the queue to its initial values; + */ + void reset() { + clear(); + currentCandidate.reset(); + } + + /** + * Called by the consumer each time the head of the queue was updated + */ + abstract void updateCurrentCandidate(); + + /** + * Holds a reference to an interval and its index. 
+ */ + final static class IntervalRef { + Interval interval; + final int index; + + IntervalRef(Interval interval, int index) { + super(); + this.interval = interval; + this.index = index; + } + } + +} \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalQueueAnd.java b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalQueueAnd.java new file mode 100644 index 0000000..eb6a282 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalQueueAnd.java @@ -0,0 +1,83 @@ +package org.apache.lucene.search.intervals; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Queue class for calculating minimal spanning conjunction intervals
 * @lucene.experimental
 */
final class IntervalQueueAnd extends IntervalQueue {

  /** the current right extreme positions of the queue */
  int rightExtreme = Integer.MIN_VALUE;
  /** the current right extreme offset of the queue */
  int rightExtremeOffset = Integer.MIN_VALUE;
  /** the current right extreme begin position*/
  int rightExtremeBegin;
  /** the end of the interval on top of the queue*/
  int currentTopEnd;

  /**
   * Creates a new {@link IntervalQueueAnd} with a fixed size
   * @param size the size of the queue
   */
  IntervalQueueAnd(int size) {
    super(size);
  }

  @Override
  void reset () {
    super.reset();
    // rightExtremeBegin / currentTopEnd are overwritten before use, so only
    // the extremes compared against with <= need re-initializing here.
    rightExtreme = Integer.MIN_VALUE;
    rightExtremeOffset = Integer.MIN_VALUE;
  }

  /**
   * Updates the right extreme of this queue if the end of the given interval is
   * greater or equal than the current right extreme of the queue.
   *
   * @param intervalRef the interval to compare
   */
  void updateRightExtreme(IntervalRef intervalRef) {
    final Interval interval = intervalRef.interval;
    if (rightExtreme <= interval.end) {
      rightExtreme = interval.end;
      rightExtremeOffset = interval.offsetEnd;
      rightExtremeBegin = interval.begin;
    }
  }

  @Override
  void updateCurrentCandidate() {
    // Candidate spans from the earliest begin (queue head) to the
    // right extreme seen so far across all sub-intervals.
    final IntervalRef top = top();
    Interval interval = top.interval;
    currentCandidate.begin = interval.begin;
    currentCandidate.offsetBegin = interval.offsetBegin;
    currentCandidate.end = rightExtreme;
    currentCandidate.offsetEnd = rightExtremeOffset;
    currentTopEnd = interval.end;

  }

  @Override
  protected boolean lessThan(IntervalRef left, IntervalRef right) {
    // Order by begin ascending, ties broken by longer interval first.
    // NOTE(review): the trailing offsetBegin clause is applied even when the
    // begin positions differ, which makes the ordering non-transitive-looking;
    // presumably it should only break exact position ties — TODO confirm.
    final Interval a = left.interval;
    final Interval b = right.interval;
    return a.begin < b.begin || (a.begin == b.begin && a.end > b.end) || a.offsetBegin < b.offsetBegin;
  }

}
b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalQueueOr.java new file mode 100644 index 0000000..49085e0 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/IntervalQueueOr.java @@ -0,0 +1,44 @@ +package org.apache.lucene.search.intervals; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** + * Queue class for calculating minimal spanning disjunct intervals + * @lucene.experimental + */ +final class IntervalQueueOr extends IntervalQueue { + + /** + * Creates a new {@link IntervalQueueOr} with a fixed size + * @param size the size of the queue + */ + IntervalQueueOr(int size) { + super(size); + } + + @Override + void updateCurrentCandidate() { + currentCandidate.copy(top().interval); + } + + @Override + protected boolean lessThan(IntervalRef left, IntervalRef right) { + final Interval a = left.interval; + final Interval b = right.interval; + return a.end < b.end || (a.end == b.end && a.begin >= b.begin); + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/NonOverlappingQuery.java b/lucene/core/src/java/org/apache/lucene/search/intervals/NonOverlappingQuery.java new file mode 100644 index 0000000..5668b9c --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/NonOverlappingQuery.java @@ -0,0 +1,356 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.util.Bits; + +import java.io.IOException; +import java.util.Set; + +/** + * A Query that matches documents containing an interval (the minuend) that + * does not contain another interval (the subtrahend). + * + * As an example, given the following {@link org.apache.lucene.search.BooleanQuery}: + *

    + *   BooleanQuery bq = new BooleanQuery();
    + *   bq.add(new TermQuery(new Term(field, "quick")), BooleanQuery.Occur.MUST);
    + *   bq.add(new TermQuery(new Term(field, "fox")), BooleanQuery.Occur.MUST);
    + * 
    + * + * The document "the quick brown fox" will be matched by this query. But + * create a NonOverlappingQuery using this query as a minuend: + *
    + *   NonOverlappingQuery brq = new NonOverlappingQuery(bq, new TermQuery(new Term(field, "brown")));
    + * 
    + * + * This query will not match "the quick brown fox", because "brown" is found + * within the interval of the boolean query for "quick" and "fox. The query + * will match "the quick fox is brown", because here "brown" is outside + * the minuend's interval. + * + * N.B. Positions must be included in the index for this query to work + * + * Implements the Brouwerian operator as defined in "Efficient Optimally Lazy Algorithms for Minimal-Interval Semantic + * + * @lucene.experimental + * @see BrouwerianIntervalIterator + */ +public final class NonOverlappingQuery extends Query implements Cloneable { + + private Query minuend; + private Query subtrahend; + + /** + * Constructs a Query that matches documents containing intervals of the minuend + * that are not subtended by the subtrahend + * @param minuend the minuend Query + * @param subtrahend the subtrahend Query + */ + public NonOverlappingQuery(Query minuend, Query subtrahend) { + this.minuend = minuend; + this.subtrahend = subtrahend; + } + + @Override + public void extractTerms(Set terms) { + minuend.extractTerms(terms); + subtrahend.extractTerms(terms); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + NonOverlappingQuery clone = null; + + Query rewritten = minuend.rewrite(reader); + Query subRewritten = subtrahend.rewrite(reader); + if (rewritten != minuend || subRewritten != subtrahend) { + clone = (NonOverlappingQuery) this.clone(); + clone.minuend = rewritten; + clone.subtrahend = subRewritten; + } + + if (clone != null) { + return clone; // some clauses rewrote + } else { + return this; // no clauses rewrote + } + } + + @Override + public Weight createWeight(IndexSearcher searcher) throws IOException { + return new BrouwerianQueryWeight(minuend.createWeight(searcher), subtrahend.createWeight(searcher)); + } + + class BrouwerianQueryWeight extends Weight { + + private final Weight minuted; + private final Weight subtracted; + + public BrouwerianQueryWeight(Weight 
minuted, Weight subtracted) { + this.minuted = minuted; + this.subtracted = subtracted; + } + + @Override + public Explanation explain(AtomicReaderContext context, int doc) + throws IOException { + return minuted.explain(context, doc); + } + + @Override + public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { + flags = flags == PostingFeatures.DOCS_AND_FREQS ? PostingFeatures.POSITIONS : flags; + ScorerFactory factory = new ScorerFactory(minuted, subtracted, context, topScorer, flags, acceptDocs); + final Scorer scorer = factory.minutedScorer(); + final Scorer subScorer = factory.subtractedScorer(); + if (subScorer == null) { + return scorer; + } + return scorer == null ? null : new PositionFilterScorer(this, scorer, subScorer, factory); + } + + @Override + public Query getQuery() { + return NonOverlappingQuery.this; + } + + @Override + public float getValueForNormalization() throws IOException { + return minuted.getValueForNormalization(); + } + + @Override + public void normalize(float norm, float topLevelBoost) { + minuted.normalize(norm, topLevelBoost); + } + } + + static class ScorerFactory { + final Weight minuted; + final Weight subtracted; + final AtomicReaderContext context; + final boolean topScorer; + final PostingFeatures flags; + final Bits acceptDocs; + ScorerFactory(Weight minuted, Weight subtracted, + AtomicReaderContext context, boolean topScorer, PostingFeatures flags, + Bits acceptDocs) { + this.minuted = minuted; + this.subtracted = subtracted; + this.context = context; + this.topScorer = topScorer; + this.flags = flags; + this.acceptDocs = acceptDocs; + } + + public Scorer minutedScorer() throws IOException { + return minuted.scorer(context, true, topScorer, flags, acceptDocs); + } + + public Scorer subtractedScorer() throws IOException { + return subtracted.scorer(context, true, topScorer, flags, acceptDocs); + } + + } + + final class 
PositionFilterScorer extends Scorer { + + private final Scorer other; + private IntervalIterator filter; + private final Scorer subtracted; + Interval current; + private final ScorerFactory factory; + + public PositionFilterScorer(Weight weight, Scorer other, Scorer subtracted, ScorerFactory factory) throws IOException { + super(weight); + this.other = other; + this.subtracted = subtracted; + this.filter = new BrouwerianIntervalIterator(other, false, other.intervals(false), subtracted.intervals(false)); + this.factory = factory; + } + + @Override + public float score() throws IOException { + return other.score(); + } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + if (collectIntervals) { + final Scorer minuted = factory.minutedScorer(); + final Scorer subtracted = factory.subtractedScorer(); + final BrouwerianIntervalIterator brouwerianIntervalIterator = new BrouwerianIntervalIterator(subtracted, true, minuted.intervals(true), subtracted.intervals(true)); + return new IntervalIterator(this, collectIntervals) { + + @Override + public int scorerAdvanced(int docId) throws IOException { + docId = minuted.advance(docId); + subtracted.advance(docId); + brouwerianIntervalIterator.scorerAdvanced(docId); + return docId; + } + + @Override + public Interval next() throws IOException { + return brouwerianIntervalIterator.next(); + } + + @Override + public void collect(IntervalCollector collector) { + brouwerianIntervalIterator.collect(collector); + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return brouwerianIntervalIterator.subs(inOrder); + } + + @Override + public int matchDistance() { + return brouwerianIntervalIterator.matchDistance(); + } + + }; + } + + + + return new IntervalIterator(this, false) { + private boolean buffered = true; + @Override + public int scorerAdvanced(int docId) throws IOException { + buffered = true; + assert docId == filter.docID(); + return docId; + } + + @Override + 
public Interval next() throws IOException { + if (buffered) { + buffered = false; + return current; + } + else if (current != null) { + return current = filter.next(); + } + return null; + } + + @Override + public void collect(IntervalCollector collector) { + filter.collect(collector); + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return filter.subs(inOrder); + } + + @Override + public int matchDistance() { + return filter.matchDistance(); + } + + }; + } + + @Override + public int docID() { + return other.docID(); + } + + @Override + public int nextDoc() throws IOException { + int docId = -1; + while ((docId = other.nextDoc()) != Scorer.NO_MORE_DOCS) { + subtracted.advance(docId); + filter.scorerAdvanced(docId); + if ((current = filter.next()) != null) { // just check if there is a position that matches! + return other.docID(); + } + } + return Scorer.NO_MORE_DOCS; + } + + @Override + public int advance(int target) throws IOException { + int docId = other.advance(target); + subtracted.advance(docId); + if (docId == Scorer.NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + do { + filter.scorerAdvanced(docId); + if ((current = filter.next()) != null) { + return other.docID(); + } + } while ((docId = other.nextDoc()) != Scorer.NO_MORE_DOCS); + return NO_MORE_DOCS; + } + + @Override + public float freq() throws IOException { + return other.freq(); + } + + } + + @Override + public String toString(String field) { + return "NonOverlappingQuery[" + minuend + ", " + subtrahend + "]"; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + ((minuend == null) ? 0 : minuend.hashCode()); + result = prime * result + + ((subtrahend == null) ? 
0 : subtrahend.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (!super.equals(obj)) return false; + if (getClass() != obj.getClass()) return false; + NonOverlappingQuery other = (NonOverlappingQuery) obj; + if (minuend == null) { + if (other.minuend != null) return false; + } else if (!minuend.equals(other.minuend)) return false; + if (subtrahend == null) { + if (other.subtrahend != null) return false; + } else if (!subtrahend.equals(other.subtrahend)) return false; + return true; + } + +} \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/OrderedConjunctionIntervalIterator.java b/lucene/core/src/java/org/apache/lucene/search/intervals/OrderedConjunctionIntervalIterator.java new file mode 100644 index 0000000..887b833 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/OrderedConjunctionIntervalIterator.java @@ -0,0 +1,159 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import org.apache.lucene.search.Scorer; + +import java.io.IOException; + +/** + * An IntervalIterator based on minimum interval semantics for the + * AND< operator + * + * See "Efficient Optimally Lazy Algorithms for Minimal-Interval Semantic + * + * @lucene.experimental + */ +public final class OrderedConjunctionIntervalIterator extends + IntervalIterator { + + private final IntervalIterator[] iterators; + private final Interval[] intervals; + private final int lastIter; + private final Interval interval = new Interval(); + + private int index = 1; + private int matchDistance = 0; + + private SnapshotPositionCollector snapshot = null; + + /** + * Create an OrderedConjunctionIntervalIterator over a composite IntervalIterator + * @param collectIntervals true if intervals will be collected + * @param other a composite IntervalIterator to wrap + */ + public OrderedConjunctionIntervalIterator(boolean collectIntervals, IntervalIterator other) { + this(other.scorer, collectIntervals, other.subs(true)); + } + + /** + * Create an OrderedConjunctionIntervalIterator over a set of subiterators + * @param scorer the parent Scorer + * @param collectIntervals true if intervals will be collected + * @param iterators the subintervals to wrap + */ + public OrderedConjunctionIntervalIterator(Scorer scorer, boolean collectIntervals, IntervalIterator... iterators) { + super(scorer, collectIntervals); + this.iterators = iterators; + assert iterators.length > 1; + intervals = new Interval[iterators.length]; + lastIter = iterators.length - 1; + } + + @Override + public Interval next() throws IOException { + if(intervals[0] == null) { + return null; + } + interval.setMaximum(); + int b = Integer.MAX_VALUE; + while (true) { + while (true) { + final Interval previous = intervals[index - 1]; + if (previous.end >= b) { + return interval.begin == Integer.MAX_VALUE ? 
null : interval; + } + if (index == intervals.length || intervals[index].begin > previous.end) { + break; + } + Interval current = intervals[index]; + do { + final Interval next; + if (current.end >= b || (next = iterators[index].next()) == null) { + return interval.begin == Integer.MAX_VALUE ? null : interval; + } + current = intervals[index] = next; + } while (current.begin <= previous.end); + index++; + } + interval.update(intervals[0], intervals[lastIter]); + matchDistance = (intervals[lastIter].begin - lastIter) - intervals[0].end; + b = intervals[lastIter].begin; + index = 1; + if (collectIntervals) + snapshotSubPositions(); + intervals[0] = iterators[0].next(); + if (intervals[0] == null) { + return interval.begin == Integer.MAX_VALUE ? null : interval; + } + } + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return iterators; + } + + @Override + public void collect(IntervalCollector collector) { + assert collectIntervals; + if (snapshot == null) { + // we might not be initialized if the first interval matches + collectInternal(collector); + } else { + snapshot.replay(collector); + } + } + + private void snapshotSubPositions() { + if (snapshot == null) { + snapshot = new SnapshotPositionCollector(iterators.length); + } + snapshot.reset(); + collectInternal(snapshot); + } + + private void collectInternal(IntervalCollector collector) { + assert collectIntervals; + collector.collectComposite(scorer, interval, docID()); + for (IntervalIterator iter : iterators) { + iter.collect(collector); + } + + } + + @Override + public int scorerAdvanced(int docId) throws IOException { + assert scorer.docID() == docId; + for (int i = 0; i < iterators.length; i++) { + int advanceTo = iterators[i].scorerAdvanced(docId); + assert advanceTo == docId; + intervals[i] = Interval.INFINITE_INTERVAL; + } + intervals[0] = iterators[0].next(); + index = 1; + return scorer.docID(); + } + + @Override + public int matchDistance() { + return matchDistance; + } + +} 
diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/OrderedNearQuery.java b/lucene/core/src/java/org/apache/lucene/search/intervals/OrderedNearQuery.java new file mode 100644 index 0000000..69924a0 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/OrderedNearQuery.java @@ -0,0 +1,57 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; + +/** + * A query that matches if a set of subqueries also match, and are within + * a given distance of each other within the document. The subqueries + * must appear in the document in order. + * + * N.B. Positions must be included in the index for this query to work + * + * Implements the AND< operator as defined in "Efficient Optimally Lazy Algorithms for Minimal-Interval Semantic + * + * @lucene.experimental + */ + +public class OrderedNearQuery extends IntervalFilterQuery { + + /** + * Constructs an OrderedNearQuery + * @param slop the maximum distance between the subquery matches + * @param subqueries the subqueries to match. 
+ */ + public OrderedNearQuery(int slop, Query... subqueries) { + super(buildBooleanQuery(subqueries), new WithinOrderedFilter(slop + subqueries.length - 1)); + } + + private static BooleanQuery buildBooleanQuery(Query... queries) { + BooleanQuery bq = new BooleanQuery(); + for (Query q : queries) { + bq.add(q, BooleanClause.Occur.MUST); + } + return bq; + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/RangeIntervalFilter.java b/lucene/core/src/java/org/apache/lucene/search/intervals/RangeIntervalFilter.java new file mode 100644 index 0000000..3d81c82 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/RangeIntervalFilter.java @@ -0,0 +1,97 @@ +package org.apache.lucene.search.intervals; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** + * An IntervalFilter that restricts Intervals returned by an IntervalIterator + * to those which occur between a given start and end position. 
+ * + * @lucene.experimental + */ +public class RangeIntervalFilter implements IntervalFilter { + + private int start; + private int end; + + /** + * Constructs a new RangeIntervalFilter + * @param start the start of the filtered range + * @param end the end of the filtered range + */ + public RangeIntervalFilter(int start, int end) { + this.start = start; + this.end = end; + } + + @Override + public IntervalIterator filter(boolean collectIntervals, IntervalIterator iter) { + return new RangeIntervalIterator(collectIntervals, iter); + } + + /** + * Wraps an IntervalIterator ignoring Intervals that fall outside a + * given range. + */ + private class RangeIntervalIterator extends IntervalIterator { + + private final IntervalIterator iterator; + private Interval interval; + + RangeIntervalIterator(boolean collectIntervals, IntervalIterator iter) { + super(iter == null ? null : iter.scorer, collectIntervals); + this.iterator = iter; + } + + @Override + public Interval next() throws IOException { + while ((interval = iterator.next()) != null) { + if(interval.end > end) { + return null; + } else if (interval.begin >= start) { + return interval; + } + } + return null; + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return new IntervalIterator[] { iterator }; + } + + @Override + public void collect(IntervalCollector collector) { + assert collectIntervals; + collector.collectComposite(null, interval, iterator.docID()); + iterator.collect(collector); + } + + @Override + public int scorerAdvanced(int docId) throws IOException { + return iterator.scorerAdvanced(docId); + } + + @Override + public int matchDistance() { + return iterator.matchDistance(); + } + + } + +} \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/SloppyIntervalIterator.java b/lucene/core/src/java/org/apache/lucene/search/intervals/SloppyIntervalIterator.java new file mode 100644 index 0000000..3a275ee --- /dev/null +++ 
b/lucene/core/src/java/org/apache/lucene/search/intervals/SloppyIntervalIterator.java @@ -0,0 +1,235 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.lucene.search.Scorer; + +import java.io.IOException; + +/** + * An interval iterator that has the semantics of sloppy phrase query. + */ +public class SloppyIntervalIterator extends IntervalIterator { + private final int maxLen; + private int matchDistance; + private final IntervalIterator iterator; + + /** + * Create a SloppyIntervalIterator that matches subiterators within + * a specified maxLength + * @param scorer the parent Scorer + * @param maxLength the maximum distance between the first and last subiterator match + * @param collectIntervals true if intervals will be collected + * @param iterators the subiterators + * @throws IOException if an low level I/O exception occurs + */ + public SloppyIntervalIterator(Scorer scorer, int maxLength, + boolean collectIntervals, IntervalIterator... 
iterators) + throws IOException { + super(scorer, collectIntervals); + this.maxLen = maxLength; + this.iterator = new ConjunctionIntervalIterator(scorer, collectIntervals, iterators); + } + + @Override + public Interval next() throws IOException { + Interval current; + do { + if ((current = iterator.next()) != null) { + matchDistance = current.end - current.begin; + if (matchDistance <= maxLen) { +// System.out.println(matchDistance); + break; + } + } else { + break; + } + } while (true); + return current; + } + + @Override + public int scorerAdvanced(int docId) throws IOException { + return iterator.scorerAdvanced(docId); + } + + public int matchDistance() { + return matchDistance; + } + + public static IntervalIterator create(Scorer scorer, boolean collectIntervals, + IntervalIterator iterator, int... offsets) { + if (offsets.length == 1) { + return new SingleSlopplyIntervalIterator(scorer, collectIntervals, iterator, offsets[0]); + } else { + return new SloppyGroupIntervalIterator(scorer, collectIntervals, iterator, offsets); + } + + } + + private final static class SingleSlopplyIntervalIterator extends + IntervalIterator { + private Interval realInterval; + private final Interval sloppyInterval = new Interval(); + private final IntervalIterator iterator; + private int offset; + + public SingleSlopplyIntervalIterator(Scorer scorer, + boolean collectIntervals, IntervalIterator iterator, int offset) { + super(scorer, collectIntervals); + this.iterator = iterator; + this.offset = offset; + } + + @Override + public int scorerAdvanced(int docId) throws IOException { + return iterator.scorerAdvanced(docId); + } + + @Override + public Interval next() throws IOException { + if ((realInterval = iterator.next()) != null) { + sloppyInterval.begin = sloppyInterval.end = realInterval.begin - offset; + sloppyInterval.offsetBegin = realInterval.offsetBegin; + sloppyInterval.offsetEnd = realInterval.offsetEnd; + return sloppyInterval; + } + return null; + } + + @Override + 
public void collect(IntervalCollector collector) { + collector.collectLeafPosition(scorer, realInterval, docID()); + + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return null; + } + + @Override + public int matchDistance() { + return sloppyInterval.end - sloppyInterval.begin; + } + + } + + private final static class SloppyGroupIntervalIterator extends + IntervalIterator { + + private final Interval sloppyGroupInterval = new Interval(); + private final int[] offsets; + private final Interval[] intervalPositions; + private final IntervalIterator groupIterator; + private int currentIndex; + private boolean initialized; + + public SloppyGroupIntervalIterator(Scorer scorer, boolean collectIntervals, + IntervalIterator groupIterator, int... offsets) { + super(scorer, collectIntervals); + this.offsets = offsets; + this.groupIterator = groupIterator; + this.intervalPositions = new Interval[offsets.length]; + for (int i = 0; i < intervalPositions.length; i++) { + intervalPositions[i] = new Interval(); + } + } + + @Override + public int scorerAdvanced(int docId) throws IOException { + initialized = false; + return groupIterator.scorerAdvanced(docId); + } + + @Override + public Interval next() throws IOException { + sloppyGroupInterval.begin = Integer.MAX_VALUE; + sloppyGroupInterval.end = Integer.MIN_VALUE; + if (!initialized) { + initialized = true; + + currentIndex = 0; + for (int i = 0; i < offsets.length; i++) { + Interval current; + if ((current = groupIterator.next()) != null) { + intervalPositions[i].copy(current); + + int p = current.begin - offsets[i]; + sloppyGroupInterval.begin = Math.min(sloppyGroupInterval.begin, p); + sloppyGroupInterval.end = Math.max(sloppyGroupInterval.end, p); + } else { + return null; + } + } + sloppyGroupInterval.offsetBegin = intervalPositions[0].offsetBegin; + sloppyGroupInterval.offsetEnd = intervalPositions[intervalPositions.length-1].offsetEnd; + return sloppyGroupInterval; + } + Interval current; + if 
((current = groupIterator.next()) != null) { + final int currentFirst = currentIndex++ % intervalPositions.length; + intervalPositions[currentFirst].copy(current); + int currentIdx = currentIndex; + for (int i = 0; i < intervalPositions.length; i++) { // find min / max + int idx = currentIdx++ % intervalPositions.length; + int p = intervalPositions[idx].begin - offsets[i]; + sloppyGroupInterval.begin = Math.min(sloppyGroupInterval.begin, p); + sloppyGroupInterval.end = Math.max(sloppyGroupInterval.end, p); + } + sloppyGroupInterval.offsetBegin = intervalPositions[currentIndex % intervalPositions.length].offsetBegin; + sloppyGroupInterval.offsetEnd = intervalPositions[currentFirst].offsetEnd; + return sloppyGroupInterval; + } + return null; + } + + @Override + public void collect(IntervalCollector collector) { + int currentIdx = currentIndex+1; + for (int i = 0; i < intervalPositions.length; i++) { // find min / max + int idx = currentIdx++ % intervalPositions.length; + collector.collectLeafPosition(scorer, intervalPositions[idx], + docID()); + } + + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return new IntervalIterator[] {groupIterator}; + } + + @Override + public int matchDistance() { + return sloppyGroupInterval.end - sloppyGroupInterval.begin; + } + + } + + @Override + public void collect(IntervalCollector collector) { + assert collectIntervals; + this.iterator.collect(collector); + + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return null; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/SnapshotPositionCollector.java b/lucene/core/src/java/org/apache/lucene/search/intervals/SnapshotPositionCollector.java new file mode 100644 index 0000000..7a4c500 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/SnapshotPositionCollector.java @@ -0,0 +1,114 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one 
or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Scorer; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * An IntervalCollector that allows a snapshot of the state of an + * IntervalIterator to be taken before it is advanced. + * + * Conjunction iterators advance their subiterators before the consumer + * can call collect on the top level iterator. If intervals are to be + * collected, we need to record the last possible match so that we can + * return the correct intervals for the match. 
+ * + * @lucene.internal + */ +final class SnapshotPositionCollector implements IntervalCollector { + + private SingleSnapshot[] snapshots; + private int index = 0; + + /** + * Create a new collector with n snapshots + * @param subs the number of subiterators to record + */ + SnapshotPositionCollector(int subs) { + snapshots = new SingleSnapshot[subs]; + } + + @Override + public void collectLeafPosition(Scorer scorer, Interval interval, + int docID) { + collect(scorer, interval, docID, true); + + } + + private void collect(Scorer scorer, Interval interval, int docID, + boolean isLeaf) { + if (snapshots.length <= index) { + grow(ArrayUtil.oversize(index + 1, + (RamUsageEstimator.NUM_BYTES_OBJECT_REF * 2) + + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + + RamUsageEstimator.NUM_BYTES_BOOLEAN + + RamUsageEstimator.NUM_BYTES_INT)); + } + if (snapshots[index] == null) { + snapshots[index] = new SingleSnapshot(); + } + snapshots[index++].set(scorer, interval, isLeaf, docID); + } + + @Override + public void collectComposite(Scorer scorer, Interval interval, + int docID) { + collect(scorer, interval, docID, false); + } + + void replay(IntervalCollector collector) { + for (int i = 0; i < index; i++) { + SingleSnapshot singleSnapshot = snapshots[i]; + if (singleSnapshot.isLeaf) { + collector.collectLeafPosition(singleSnapshot.scorer, + singleSnapshot.interval, singleSnapshot.docID); + } else { + collector.collectComposite(singleSnapshot.scorer, + singleSnapshot.interval, singleSnapshot.docID); + } + } + } + + void reset() { + index = 0; + } + + private void grow(int size) { + final SingleSnapshot[] newArray = new SingleSnapshot[size]; + System.arraycopy(snapshots, 0, newArray, 0, index); + snapshots = newArray; + } + + private static final class SingleSnapshot { + Scorer scorer; + final Interval interval = new Interval(); + boolean isLeaf; + int docID; + + void set(Scorer scorer, Interval interval, boolean isLeaf, + int docID) { + this.scorer = scorer; + 
this.interval.copy(interval); + this.isLeaf = isLeaf; + this.docID = docID; + } + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/TermIntervalIterator.java b/lucene/core/src/java/org/apache/lucene/search/intervals/TermIntervalIterator.java new file mode 100644 index 0000000..1848c2b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/TermIntervalIterator.java @@ -0,0 +1,125 @@ +package org.apache.lucene.search.intervals; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.search.Scorer; + +import java.io.IOException; + + +/** + * Iterates over the individual positions of a term in a document + */ +public final class TermIntervalIterator extends IntervalIterator { + + private final Interval interval; + int positionsPending; + private final DocsAndPositionsEnum docsAndPos; + private int docID = -1; + + /** + * Constructs a new TermIntervalIterator + * @param scorer the parent Scorer + * @param docsAndPos a DocsAndPositionsEnum positioned on the current document + * @param doPayloads true if payloads should be retrieved for the positions + * @param collectIntervals true if positions will be collected + */ + public TermIntervalIterator(Scorer scorer, DocsAndPositionsEnum docsAndPos, + boolean doPayloads, boolean collectIntervals) { + super(scorer, collectIntervals); + this.docsAndPos = docsAndPos; + this.interval = new Interval(); + } + + @Override + public Interval next() throws IOException { + if (--positionsPending >= 0) { + interval.begin = interval.end = docsAndPos.nextPosition(); + interval.offsetBegin = docsAndPos.startOffset(); + interval.offsetEnd = docsAndPos.endOffset(); + return interval; + } + positionsPending = 0; + return null; + } + + @Override + public int docID() { + return docID; + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return EMPTY; + } + + @Override + public void collect(IntervalCollector collector) { + collector.collectLeafPosition(scorer, interval, docID); + } + + @Override + public int scorerAdvanced(int docId) throws IOException { + interval.reset(); + if (docsAndPos.docID() == docId) { + positionsPending = docsAndPos.freq(); + } else { + positionsPending = -1; + } + return docID = docsAndPos.docID(); + } + + @Override + public String toString() { + return "TermPositions [interval=" + interval + ", positionsPending=" + + positionsPending + ", docID=" + docID + "]"; + } + + @Override + public int 
matchDistance() { + return 0; + } +// TODO not supported yet - need to figure out what that means really to support payloads +// private static final class PayloadInterval extends Interval { +// private int pos = -1; +// private final DocsAndPositionsEnum payloads; +// private final TermIntervalIterator termPos; +// +// public PayloadInterval(DocsAndPositionsEnum payloads, TermIntervalIterator pos) { +// this.payloads = payloads; +// this.termPos = pos; +// } +// +// @Override +// public BytesRef nextPayload() throws IOException { +// if (pos == termPos.positionsPending) { +// return null; +// } else { +// pos = termPos.positionsPending; +// return payloads.getPayload(); +// } +// } +// +// @Override +// public void reset() { +// super.reset(); +// pos = -1; +// } +// +// } +} \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/UnorderedNearQuery.java b/lucene/core/src/java/org/apache/lucene/search/intervals/UnorderedNearQuery.java new file mode 100644 index 0000000..c2f9399 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/UnorderedNearQuery.java @@ -0,0 +1,57 @@ +package org.apache.lucene.search.intervals; + +/** + * Copyright (c) 2012 Lemur Consulting Ltd. + *

    + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

    + * http://www.apache.org/licenses/LICENSE-2.0 + *

    + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; + +/** + * A query that matches if a set of subqueries also match, and are within + * a given distance of each other within the document. The subqueries + * may appear in the document in any order. + * + * N.B. Positions must be included in the index for this query to work + * + * Implements the LOWPASSk operator as defined in "Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics" + * + * @lucene.experimental + */ + +public class UnorderedNearQuery extends IntervalFilterQuery { + + /** + * Constructs an UnorderedNearQuery + * @param slop the maximum distance between the subquery matches + * @param subqueries the subqueries to match. + */ + public UnorderedNearQuery(int slop, Query... subqueries) { + super(buildBooleanQuery(subqueries), new WithinIntervalFilter(slop + subqueries.length - 1)); + } + + private static BooleanQuery buildBooleanQuery(Query... queries) { + BooleanQuery bq = new BooleanQuery(); + for (Query q : queries) { + bq.add(q, BooleanClause.Occur.MUST); + } + return bq; + } + +} + diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/WithinIntervalFilter.java b/lucene/core/src/java/org/apache/lucene/search/intervals/WithinIntervalFilter.java new file mode 100644 index 0000000..53b0775 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/WithinIntervalFilter.java @@ -0,0 +1,96 @@ +package org.apache.lucene.search.intervals; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; + + +/** + * An IntervalFilter that restricts Intervals returned by an IntervalIterator + * to those which have a matchDistance less than a defined slop. + * + * @lucene.experimental + */ +public class WithinIntervalFilter implements IntervalFilter { + + private final int slop; + + /** + * Construct a new WithinIntervalFilter + * @param slop the maximum slop allowed for subintervals + */ + public WithinIntervalFilter(int slop) { + this.slop = slop; + } + + /** + * @return the slop + */ + public int getSlop() { + return slop; + } + + @Override + public IntervalIterator filter(boolean collectIntervals, IntervalIterator iter) { + return new WithinIntervalIterator(collectIntervals, iter); + } + + class WithinIntervalIterator extends IntervalIterator { + + private IntervalIterator iterator; + private Interval interval; + + WithinIntervalIterator(boolean collectIntervals, IntervalIterator iter) { + super(iter == null ? 
null : iter.scorer, collectIntervals); + this.iterator = iter; + } + + @Override + public Interval next() throws IOException { + while ((interval = iterator.next()) != null) { + if((iterator.matchDistance()) <= slop){ + return interval; + } + } + return null; + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return new IntervalIterator[] {iterator}; + } + + + @Override + public void collect(IntervalCollector collector) { + assert collectIntervals; + collector.collectComposite(null, interval, iterator.docID()); + iterator.collect(collector); + } + + @Override + public int scorerAdvanced(int docId) throws IOException { + return iterator.scorerAdvanced(docId); + } + + @Override + public int matchDistance() { + return iterator.matchDistance(); + } + + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/WithinOrderedFilter.java b/lucene/core/src/java/org/apache/lucene/search/intervals/WithinOrderedFilter.java new file mode 100644 index 0000000..527a40a --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/WithinOrderedFilter.java @@ -0,0 +1,49 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * An IntervalFilter that restricts an IntervalIterator to return + * only Intervals that occur in order within a given distance. + * + * @see WithinIntervalFilter + */ +public class WithinOrderedFilter implements IntervalFilter { + + private final WithinIntervalFilter innerFilter; + + /** + * Constructs a new WithinOrderedFilter with a given slop + * @param slop The maximum distance allowed between subintervals + */ + public WithinOrderedFilter(int slop) { + this.innerFilter = new WithinIntervalFilter(slop); + } + + @Override + public IntervalIterator filter(boolean collectIntervals, IntervalIterator iter) { + return innerFilter.filter(collectIntervals, + new OrderedConjunctionIntervalIterator(collectIntervals, iter)); + } + + @Override + public String toString() { + return "WithinOrderedFilter[" + this.innerFilter.getSlop() + "]"; + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/search/intervals/package.html b/lucene/core/src/java/org/apache/lucene/search/intervals/package.html new file mode 100644 index 0000000..245bf96 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/intervals/package.html @@ -0,0 +1,70 @@ + + + + + org.apache.lucene.search.intervals + + +

    Interval Iterators

    +

    +Lucene offers extensive query and scoring flexibility including boolean queries, specialized phrase queries, wildcards and many more. The intervals package aims +to provide a common interface to Lucene's proximity features available on all core queries. The central class in this package is +{@link org.apache.lucene.search.intervals.IntervalIterator IntervalIterator}, which allows iterative consumption of term positions and offsets on complex queries. +{@link org.apache.lucene.search.Scorer Scorer} exposes direct access to the queries' {@link org.apache.lucene.search.intervals.IntervalIterator IntervalIterator} reflecting a logical view +of the scorer on positions and offsets for each matching document.

    +

    +Intervals are entirely detached from scoring/matching documents and have no effect on query performance if proximity information or offsets are not needed or consumed. Its lazy nature requires +the user to specify the need for positions/offsets at scorer creation time per segment allowing for a large number of usecases: + +

      +
    • Proximity matching without scoring ie. if token positions are needed for filtering out documents but the actual query score should not be modified
    • +
    • Second pass scoring ie. for high-performance proximity queries common practice is to re-score the top N (usually a large N) results of a non-proximity query with proximity information to improve precision.
    • +
    • Collecting an exhaustive list of intervals per query ie. complex queries might be interested in actual term positions across the entire query tree
    • +
    • Highlighting queries without re-analyzing the document or storing term vectors if offsets are stored in the index. Especially large documents will see a tremendous performance and space-consumption improvement over term-vectors / re-analyzing
    • +
    • Specializing queries for exotic proximity operators based on core queries
    • +
    + +

    Core Iterators and Queries

    + +The intervals package provides a basic set of {@link org.apache.lucene.search.intervals.IntervalIterator IntervalIterator} and {@link org.apache.lucene.search.Query Query} implementation +based on minimum interval semantics, as defined in +"Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics +

    + The following {@link org.apache.lucene.search.intervals.IntervalIterator IntervalIterator} implementations are provided: +

      +
    1. {@link org.apache.lucene.search.intervals.BlockIntervalIterator - BlockIntervalIterator} -- an iterator providing an ordered phrasal operator with given gaps between sub-iterators
    2. +
    3. {@link org.apache.lucene.search.intervals.OrderedConjunctionIntervalIterator - OrderedConjunctionIntervalIterator} -- an iterator providing an ordered non-overlapping conjunction operator
    4. +
    5. {@link org.apache.lucene.search.intervals.ConjunctionIntervalIterator - ConjunctionIntervalIterator} -- an iterator providing a unordered conjunction operator
    6. +
    7. {@link org.apache.lucene.search.intervals.BrouwerianIntervalIterator - BrouwerianIntervalIterator} -- an iterator computing the non-overlapping difference between two iterators
    8. +
    9. {@link org.apache.lucene.search.intervals.DisjunctionIntervalIterator - DisjunctionIntervalIterator} -- an iterator providing a unordered disjunction operator
    10. +
    + All queries require positions to be stored in the index. +

    + +

    + The following Query implementations are provided: +

      +
    1. {@link org.apache.lucene.search.intervals.IntervalFilterQuery - IntervalFilterQuery} -- Filters a Query based on the positions or ranges of its component parts
    2. +
    3. {@link org.apache.lucene.search.intervals.OrderedNearQuery - OrderedNearQuery} -- Filters queries based on the ordered difference between their match positions in a document
    4. +
    5. {@link org.apache.lucene.search.intervals.UnorderedNearQuery - UnorderedNearQuery} -- Filters queries based on the unordered difference between their match positions in a document
    6. +
    7. {@link org.apache.lucene.search.intervals.NonOverlappingQuery - NonOverlappingQuery} -- Filters out queries with overlapping match positions
    8. +
    + All queries require positions to be stored in the index. +

    + + diff --git a/lucene/core/src/java/org/apache/lucene/search/package.html b/lucene/core/src/java/org/apache/lucene/search/package.html index 52817bd..4f6b763 100644 --- a/lucene/core/src/java/org/apache/lucene/search/package.html +++ b/lucene/core/src/java/org/apache/lucene/search/package.html @@ -430,8 +430,9 @@ on the built-in available scoring models and extending or changing Similarity. that scores via a {@link org.apache.lucene.search.similarities.Similarity Similarity} will just defer to the Similarity's implementation: {@link org.apache.lucene.search.similarities.Similarity.SimWeight#normalize SimWeight#normalize(float,float)}.
  • - {@link org.apache.lucene.search.Weight#scorer(org.apache.lucene.index.AtomicReaderContext, boolean, boolean, org.apache.lucene.util.Bits) - scorer(AtomicReaderContext context, boolean scoresDocsInOrder, boolean topScorer, Bits acceptDocs)} — + {@link org.apache.lucene.search.Weight#scorer(org.apache.lucene.index.AtomicReaderContext, boolean, boolean, + org.apache.lucene.search.Weight.PostingFeatures, org.apache.lucene.util.Bits) + scorer(AtomicReaderContext context, boolean scoresDocsInOrder, boolean topScorer, PostingFeatures flags, Bits acceptDocs)} — Construct a new {@link org.apache.lucene.search.Scorer Scorer} for this Weight. See The Scorer Class below for help defining a Scorer. As the name implies, the Scorer is responsible for doing the actual scoring of documents given the Query. diff --git a/lucene/core/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java b/lucene/core/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java index ef2c6e5..68a8c26 100644 --- a/lucene/core/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java @@ -149,14 +149,14 @@ public class PayloadNearQuery extends SpanNearQuery { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { return new PayloadNearSpanScorer(query.getSpans(context, acceptDocs, termContexts), this, similarity, similarity.sloppySimScorer(stats, context)); } @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - PayloadNearSpanScorer scorer = (PayloadNearSpanScorer) scorer(context, true, false, context.reader().getLiveDocs()); + PayloadNearSpanScorer scorer = (PayloadNearSpanScorer) scorer(context, true, false, PostingFeatures.POSITIONS, context.reader().getLiveDocs()); if (scorer != 
null) { int newDoc = scorer.advance(doc); if (newDoc == doc) { diff --git a/lucene/core/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java index 7c9e9e6..7e4a7f4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java @@ -80,7 +80,7 @@ public class PayloadTermQuery extends SpanTermQuery { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { return new PayloadTermSpanScorer((TermSpans) query.getSpans(context, acceptDocs, termContexts), this, similarity.sloppySimScorer(stats, context)); } @@ -175,7 +175,7 @@ public class PayloadTermQuery extends SpanTermQuery { @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - PayloadTermSpanScorer scorer = (PayloadTermSpanScorer) scorer(context, true, false, context.reader().getLiveDocs()); + PayloadTermSpanScorer scorer = (PayloadTermSpanScorer) scorer(context, true, false, PostingFeatures.POSITIONS, context.reader().getLiveDocs()); if (scorer != null) { int newDoc = scorer.advance(doc); if (newDoc == doc) { diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java index 68a91eb..4774bb5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.search.Weight; import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.search.similarities.Similarity; /** @@ -97,4 +98,9 @@ public class 
SpanScorer extends Scorer { public float freq() throws IOException { return freq; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return null; + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java index 7292e90..9ee8e4e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java @@ -82,7 +82,7 @@ public class SpanWeight extends Weight { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { if (stats == null) { return null; } else { @@ -92,7 +92,7 @@ public class SpanWeight extends Weight { @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - Scorer scorer = scorer(context, true, false, context.reader().getLiveDocs()); + Scorer scorer = scorer(context, true, false, PostingFeatures.POSITIONS, context.reader().getLiveDocs()); if (scorer != null) { int newDoc = scorer.advance(doc); if (newDoc == doc) { diff --git a/lucene/core/src/test/org/apache/lucene/search/JustCompileSearch.java b/lucene/core/src/test/org/apache/lucene/search/JustCompileSearch.java index b90790b..355e920 100644 --- a/lucene/core/src/test/org/apache/lucene/search/JustCompileSearch.java +++ b/lucene/core/src/test/org/apache/lucene/search/JustCompileSearch.java @@ -18,13 +18,16 @@ package org.apache.lucene.search; */ import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.Norm; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.Bits; import 
org.apache.lucene.util.BytesRef; -import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.util.PriorityQueue; +import java.io.IOException; + /** * Holds all implementations of classes in the o.a.l.search package as a * back-compatibility test. It does not run any tests per-se, however if @@ -190,7 +193,7 @@ final class JustCompileSearch { static final class JustCompilePhraseScorer extends PhraseScorer { JustCompilePhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity.SloppySimScorer docScorer) { + Similarity.SloppySimScorer docScorer) throws IOException { super(weight, postings, docScorer); } @@ -198,6 +201,11 @@ final class JustCompileSearch { protected float phraseFreq() { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException(UNSUPPORTED_MSG); + } } @@ -245,6 +253,11 @@ final class JustCompileSearch { public int advance(int target) { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException(UNSUPPORTED_MSG); + } } static final class JustCompileSimilarity extends Similarity { @@ -337,7 +350,7 @@ final class JustCompileSearch { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) { throw new UnsupportedOperationException(UNSUPPORTED_MSG); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanOr.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanOr.java index 22e29b7..a921c09 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanOr.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanOr.java @@ -24,6 +24,7 @@ import 
org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.store.Directory; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.LuceneTestCase; @@ -182,7 +183,7 @@ public class TestBooleanOr extends LuceneTestCase { Weight w = s.createNormalizedWeight(bq); assertEquals(1, s.getIndexReader().leaves().size()); - Scorer scorer = w.scorer(s.getIndexReader().leaves().get(0), false, true, null); + Scorer scorer = w.scorer(s.getIndexReader().getContext().leaves().get(0), false, true, PostingFeatures.DOCS_AND_FREQS, null); final FixedBitSet hits = new FixedBitSet(docCount); final AtomicInteger end = new AtomicInteger(); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java index 1eaa747..14bfef2 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanQuery.java @@ -17,6 +17,7 @@ package org.apache.lucene.search; * limitations under the License. 
*/ +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -36,6 +37,11 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.MultiReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.search.intervals.IntervalFilterQuery; +import org.apache.lucene.search.intervals.RangeIntervalFilter; +import org.apache.lucene.search.intervals.WithinIntervalFilter; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; @@ -234,7 +240,7 @@ public class TestBooleanQuery extends LuceneTestCase { Weight weight = s.createNormalizedWeight(q); Scorer scorer = weight.scorer(s.leafContexts.get(0), - true, false, null); + true, false, PostingFeatures.DOCS_AND_FREQS, null); // First pass: just use .nextDoc() to gather all hits final List hits = new ArrayList(); @@ -252,7 +258,7 @@ public class TestBooleanQuery extends LuceneTestCase { weight = s.createNormalizedWeight(q); scorer = weight.scorer(s.leafContexts.get(0), - true, false, null); + true, false, PostingFeatures.DOCS_AND_FREQS, null); if (VERBOSE) { System.out.println(" iter2=" + iter2); @@ -290,6 +296,57 @@ public class TestBooleanQuery extends LuceneTestCase { r.close(); d.close(); } + + public void testConjunctionPositions() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + { + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge hot! Pease porridge cold! Pease porridge in the pot nine days old! Some like it hot, some" + + " like it cold, Some like it in the pot nine days old! 
Pease porridge hot! Pease porridge cold!", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + + { + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge cold! Pease porridge hot! Pease porridge in the pot nine days old! Some like it cold, some" + + " like it hot, Some like it in the pot nine days old! Pease porridge cold! Pease porridge hot!", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "pease")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "hot!")), Occur.MUST)); + + { + IntervalFilterQuery filter = new IntervalFilterQuery(query, new RangeIntervalFilter(0,3)); + TopDocs search = searcher.search(filter, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(1, search.totalHits); + assertEquals(0, scoreDocs[0].doc); + } + { + IntervalFilterQuery filter = new IntervalFilterQuery(query, new WithinIntervalFilter(3)); + TopDocs search = searcher.search(filter, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(2, search.totalHits); + assertEquals(0, scoreDocs[0].doc); + assertEquals(1, scoreDocs[1].doc); + } + reader.close(); + directory.close(); + } // LUCENE-4477 / LUCENE-4401: public void testBooleanSpanQuery() throws Exception { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanScorer.java index 4bc0fc7..c800aef 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanScorer.java @@ -17,6 +17,7 @@ package org.apache.lucene.search; * limitations under the License. 
*/ +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -29,6 +30,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery.BooleanWeight; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -91,6 +93,10 @@ public class TestBooleanScorer extends LuceneTestCase @Override public int advance(int target) { return doc = target <= 3000 ? 3000 : NO_MORE_DOCS; } + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return null; + } }}; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestCachingCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestCachingCollector.java index b68ce41..3fb50e9 100755 --- a/lucene/core/src/test/org/apache/lucene/search/TestCachingCollector.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestCachingCollector.java @@ -18,6 +18,7 @@ package org.apache.lucene.search; */ import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.LuceneTestCase; import java.io.IOException; @@ -46,6 +47,11 @@ public class TestCachingCollector extends LuceneTestCase { @Override public int advance(int target) throws IOException { return 0; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return IntervalIterator.NO_MORE_POSITIONS; + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java index 0a29c6e..622df49 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java @@ 
-31,6 +31,7 @@ import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.StoredDocument; import org.apache.lucene.index.Term; +import org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; @@ -175,7 +176,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase { assertTrue(s.getTopReaderContext() instanceof AtomicReaderContext); final Weight dw = s.createNormalizedWeight(dq); AtomicReaderContext context = (AtomicReaderContext)s.getTopReaderContext(); - final Scorer ds = dw.scorer(context, true, false, context.reader().getLiveDocs()); + final Scorer ds = dw.scorer(context, true, false, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()); final boolean skipOk = ds.advance(3) != DocIdSetIterator.NO_MORE_DOCS; if (skipOk) { fail("firsttime skipTo found a match? ... 
" @@ -191,7 +192,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase { QueryUtils.check(random(), dq, s); final Weight dw = s.createNormalizedWeight(dq); AtomicReaderContext context = (AtomicReaderContext)s.getTopReaderContext(); - final Scorer ds = dw.scorer(context, true, false, context.reader().getLiveDocs()); + final Scorer ds = dw.scorer(context, true, false, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()); assertTrue("firsttime skipTo found no match", ds.advance(3) != DocIdSetIterator.NO_MORE_DOCS); assertEquals("found wrong docid", "d4", r.document(ds.docID()).get("id")); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPositiveScoresOnlyCollector.java b/lucene/core/src/test/org/apache/lucene/search/TestPositiveScoresOnlyCollector.java index ed9334f..8986dbd 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPositiveScoresOnlyCollector.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPositiveScoresOnlyCollector.java @@ -20,9 +20,12 @@ package org.apache.lucene.search; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import java.io.IOException; + public class TestPositiveScoresOnlyCollector extends LuceneTestCase { private static final class SimpleScorer extends Scorer { @@ -50,6 +53,11 @@ public class TestPositiveScoresOnlyCollector extends LuceneTestCase { idx = target; return idx < scores.length ? 
idx : NO_MORE_DOCS; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException(); + } } // The scores must have positive as well as negative values diff --git a/lucene/core/src/test/org/apache/lucene/search/TestScoreCachingWrappingScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestScoreCachingWrappingScorer.java index dc52313..fb6cc5d 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestScoreCachingWrappingScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestScoreCachingWrappingScorer.java @@ -17,15 +17,16 @@ package org.apache.lucene.search; * limitations under the License. */ -import java.io.IOException; - import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import java.io.IOException; + public class TestScoreCachingWrappingScorer extends LuceneTestCase { private static final class SimpleScorer extends Scorer { @@ -58,7 +59,11 @@ public class TestScoreCachingWrappingScorer extends LuceneTestCase { doc = target; return doc < scores.length ? 
doc : NO_MORE_DOCS; } - + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException(); + } } private static final class ScoreCachingCollector extends Collector { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java new file mode 100644 index 0000000..085f755 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java @@ -0,0 +1,294 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.*; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * This class contains tests related to {@link TermQuery} + */ +public class TestTermQuery extends LuceneTestCase { + + private String fieldName = "field"; + + /** + * Simple testcase for {@link TermScorer#intervals(boolean)} + */ + public void testPositionsSimple() throws IOException { + Directory directory = newDirectory(); + + final Analyzer analyzer = new MockAnalyzer(random(), + MockTokenizer.WHITESPACE, false, MockTokenFilter.EMPTY_STOPSET, true); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); + for (int i = 0; i < 39; i++) { + Document doc = new Document(); + doc.add(newField(fieldName, "1 2 3 4 5 6 7 8 9 10 " + + "1 2 3 4 5 6 7 8 9 10 " + "1 2 3 4 5 6 7 8 9 10 " + + "1 2 3 4 5 6 7 8 9 10", TextField.TYPE_STORED)); + writer.addDocument(doc); + } + IndexReader reader = writer.getReader(); + TermsEnum te = MultiFields.getTerms(reader, + fieldName).iterator(null); + te.seekExact(new BytesRef("1"), false); + DocsAndPositionsEnum docsAndPositions = te.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_PAYLOADS); + assertEquals(39, reader.docFreq(new Term(fieldName, "1"))); + docsAndPositions.nextDoc(); + docsAndPositions.nextPosition(); + boolean 
payloadsIndexed = false; // TODO we should enable payloads here + + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + + for (int i = 0; i < 39 * RANDOM_MULTIPLIER; i++) { + TermQuery one = new TermQuery(new Term(fieldName, "1")); + IndexReaderContext topReaderContext = reader.getContext(); + List leaves = topReaderContext.leaves(); + Weight weight = one.createWeight(searcher); + for (AtomicReaderContext atomicReaderContext : leaves) { + Scorer scorer = weight.scorer(atomicReaderContext, true, true, PostingFeatures.POSITIONS, null); + assertNotNull(scorer); + int toDoc = 1 + random().nextInt(atomicReaderContext.reader().docFreq(new Term(fieldName, "1")) - 1 ); + final int advance = scorer.advance(toDoc); + IntervalIterator positions = scorer.intervals(false); + + do { + assertEquals(scorer.docID(), positions.scorerAdvanced(scorer.docID())); + + Interval interval = null; + String msg = "Advanced to: " + advance + " current doc: " + + scorer.docID() + " usePayloads: " + payloadsIndexed; + assertNotNull(msg, (interval = positions.next())); + assertEquals(msg, 4.0f, positions.getScorer().freq(), 0.0f); + + assertEquals(msg, 0, interval.begin); + assertEquals(msg, 0, interval.end); + checkPayload(0, interval, payloadsIndexed); + + assertNotNull(msg, (interval = positions.next())); + assertEquals(msg, 4.0f, positions.getScorer().freq(), 0.0f); + assertEquals(msg, 10, interval.begin); + assertEquals(msg, 10, interval.end); + checkPayload(10, interval, payloadsIndexed); + + assertNotNull(msg, (interval = positions.next())); + assertEquals(msg, 4.0f, positions.getScorer().freq(), 0.0f); + assertEquals(msg, 20, interval.begin); + assertEquals(msg, 20, interval.end); + checkPayload(20, interval, payloadsIndexed); + + assertNotNull(msg, (interval = positions.next())); + assertEquals(msg, 4.0f, positions.getScorer().freq(), 0.0f); + assertEquals(msg, 30, interval.begin); + assertEquals(msg, 30, interval.end); + checkPayload(30, interval, payloadsIndexed); + 
+ assertNull(msg, (interval = positions.next())); + } while (scorer.nextDoc() != Scorer.NO_MORE_DOCS); + } + } + reader.close(); + directory.close(); + } + + public final void checkPayload(int pos, Interval interval, + boolean payloadsIndexed) throws IOException { + // not supported yet need to figure out how to expose this efficiently +// if (payloadsIndexed) { +// assertNotNull(interval.nextPayload()); +// } else { +// assertNull(interval.nextPayload()); +// } + } + + /** + * this test indexes random numbers within a range into a field and checks + * their occurrences by searching for a number from that range selected at + * random. All positions for that number are safed up front and compared to + * the terms scorers positions. + * + */ + public void testRandomPositons() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + int numDocs = 131; + int max = 1051; + int term = random().nextInt(max); + Integer[][] positionsInDoc = new Integer[numDocs][]; + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + ArrayList positions = new ArrayList(); + StringBuilder builder = new StringBuilder(); + for (int j = 0; j < 3049; j++) { + int nextInt = random().nextInt(max); + builder.append(nextInt).append(" "); + if (nextInt == term) { + positions.add(Integer.valueOf(j)); + } + } + doc.add(newField(fieldName, builder.toString(), TextField.TYPE_STORED)); + positionsInDoc[i] = positions.toArray(new Integer[0]); + writer.addDocument(doc); + } + + IndexReader reader = writer.getReader(); + writer.close(); + IndexSearcher searcher = new IndexSearcher(reader); + + for (int i = 0; i < 39 * RANDOM_MULTIPLIER; i++) { + TermQuery one = new TermQuery(new Term(fieldName, "" + term)); + IndexReaderContext topReaderContext = reader.getContext(); + List leaves = topReaderContext.leaves(); + Weight weight = 
one.createWeight(searcher); + for (AtomicReaderContext atomicReaderContext : leaves) { + Scorer scorer = weight.scorer(atomicReaderContext, true, true, PostingFeatures.POSITIONS, null); + assertNotNull(scorer); + int initDoc = 0; + int maxDoc = atomicReaderContext.reader().maxDoc(); + // initially advance or do next doc + if (random().nextBoolean()) { + initDoc = scorer.nextDoc(); + } else { + initDoc = scorer.advance(random().nextInt(maxDoc)); + } + // now run through the scorer and check if all positions are there... + do { + int docID = scorer.docID(); + if (docID == Scorer.NO_MORE_DOCS) { + break; + } + IntervalIterator positions = scorer.intervals(false); + Integer[] pos = positionsInDoc[atomicReaderContext.docBase + docID]; + + assertEquals((float) pos.length, positions.getScorer().freq(), 0.0f); + // number of positions read should be random - don't read all of them + // allways + final int howMany = random().nextInt(20) == 0 ? pos.length + - random().nextInt(pos.length) : pos.length; + Interval interval = null; + assertEquals(scorer.docID(), positions.scorerAdvanced(scorer.docID())); + for (int j = 0; j < howMany; j++) { + assertNotNull((interval = positions.next())); + assertEquals("iteration: " + i + " initDoc: " + initDoc + " doc: " + + docID + " base: " + atomicReaderContext.docBase + + " positions: " + Arrays.toString(pos), pos[j].intValue(), + interval.begin); + assertEquals(pos[j].intValue(), interval.end); + } + if (howMany == pos.length) { + assertNull((interval = positions.next())); + } + + if (random().nextInt(10) == 0) { // once is a while advance + scorer.advance(docID + 1 + random().nextInt((maxDoc - docID))); + } + + } while (scorer.docID() != Scorer.NO_MORE_DOCS && scorer.nextDoc() != Scorer.NO_MORE_DOCS); + } + + } + reader.close(); + dir.close(); + } + + /** + * tests retrieval of positions for terms that have a large number of + * occurrences to force test of buffer refill during positions iteration. 
+ */ + public void testLargeNumberOfPositions() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, + + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + int howMany = 1000; + for (int i = 0; i < 39; i++) { + Document doc = new Document(); + StringBuilder builder = new StringBuilder(); + for (int j = 0; j < howMany; j++) { + if (j % 2 == 0) { + builder.append("even "); + } else { + builder.append("odd "); + } + } + doc.add(newField(fieldName, builder.toString(),TextField.TYPE_STORED)); + writer.addDocument(doc); + } + + // now do seaches + IndexReader reader = writer.getReader(); + writer.close(); + IndexSearcher searcher = new IndexSearcher(reader); + + for (int i = 0; i < 39 * RANDOM_MULTIPLIER; i++) { + TermQuery one = new TermQuery(new Term(fieldName, "even")); + IndexReaderContext topReaderContext = reader.getContext(); + List leaves = topReaderContext.leaves(); + Weight weight = one.createWeight(searcher); + Interval interval = null; + for (AtomicReaderContext atomicReaderContext : leaves) { + Scorer scorer = weight.scorer(atomicReaderContext, true, true, PostingFeatures.POSITIONS, null); + assertNotNull(scorer); + + int initDoc = 0; + int maxDoc = atomicReaderContext.reader().maxDoc(); + // initially advance or do next doc + if (random().nextBoolean()) { + initDoc = scorer.nextDoc(); + } else { + initDoc = scorer.advance(random().nextInt(maxDoc)); + } + String msg = "Iteration: " + i + " initDoc: " + initDoc; + IntervalIterator positions = scorer.intervals(false); + assertEquals(howMany / 2.f, positions.getScorer().freq(), 0.0); + assertEquals(scorer.docID(), positions.scorerAdvanced(scorer.docID())); + for (int j = 0; j < howMany; j += 2) { + assertNotNull("next returned nullat index: " + j + " with freq: " + + positions.getScorer().freq() + " -- " + msg, + (interval = positions.next())); + assertEquals("position missmatch index: " + j + " with freq: " + + 
positions.getScorer().freq() + " -- " + msg, j, interval.begin); + } + assertNull("next returned nonNull -- " + msg, + (interval = positions.next())); + + } + } + reader.close(); + dir.close(); + } + +} \ No newline at end of file diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java b/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java index f5b62a3..b5bebee 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestTermScorer.java @@ -29,6 +29,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.SlowCompositeReaderWrapper; import org.apache.lucene.index.Term; +import org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -78,7 +79,7 @@ public class TestTermScorer extends LuceneTestCase { Weight weight = indexSearcher.createNormalizedWeight(termQuery); assertTrue(indexSearcher.getTopReaderContext() instanceof AtomicReaderContext); AtomicReaderContext context = (AtomicReaderContext)indexSearcher.getTopReaderContext(); - Scorer ts = weight.scorer(context, true, true, context.reader().getLiveDocs()); + Scorer ts = weight.scorer(context, true, true, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()); // we have 2 documents with the term all in them, one document for all the // other values final List docs = new ArrayList(); @@ -140,7 +141,7 @@ public class TestTermScorer extends LuceneTestCase { Weight weight = indexSearcher.createNormalizedWeight(termQuery); assertTrue(indexSearcher.getTopReaderContext() instanceof AtomicReaderContext); AtomicReaderContext context = (AtomicReaderContext) indexSearcher.getTopReaderContext(); - Scorer ts = weight.scorer(context, true, true, context.reader().getLiveDocs()); + Scorer ts = 
weight.scorer(context, true, true, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()); assertTrue("next did not return a doc", ts.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertTrue("score is not correct", ts.score() == 1.6931472f); @@ -159,7 +160,7 @@ public class TestTermScorer extends LuceneTestCase { Weight weight = indexSearcher.createNormalizedWeight(termQuery); assertTrue(indexSearcher.getTopReaderContext() instanceof AtomicReaderContext); AtomicReaderContext context = (AtomicReaderContext) indexSearcher.getTopReaderContext(); - Scorer ts = weight.scorer(context, true, true, context.reader().getLiveDocs()); + Scorer ts = weight.scorer(context, true, true, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()); assertTrue("Didn't skip", ts.advance(3) != DocIdSetIterator.NO_MORE_DOCS); // The next doc should be doc 5 assertTrue("doc should be number 5", ts.docID() == 5); diff --git a/lucene/core/src/test/org/apache/lucene/search/intervals/TestBasicIntervals.java b/lucene/core/src/test/org/apache/lucene/search/intervals/TestBasicIntervals.java new file mode 100644 index 0000000..1451d83 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/intervals/TestBasicIntervals.java @@ -0,0 +1,437 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.search.intervals.BlockIntervalIterator; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalFilter; +import org.apache.lucene.search.intervals.IntervalFilterQuery; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.search.intervals.WithinIntervalFilter; +import org.apache.lucene.search.intervals.WithinOrderedFilter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +import java.io.IOException; + +public class TestBasicIntervals extends LuceneTestCase { + private IndexSearcher searcher; + private IndexReader reader; + private Directory directory; + + public static final String field = "field"; + + @Override + public void setUp() throws Exception { + super.setUp(); + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())) + .setMergePolicy(newLogMergePolicy())); + for (int i = 0; i < docFields.length; i++) { + Document doc = new Document(); + doc.add(newField(field, docFields[i], TextField.TYPE_STORED)); + writer.addDocument(doc); + } + writer.forceMerge(1); + reader = writer.getReader(); + writer.close(); + searcher = newSearcher(reader); + } + + @Override + public void tearDown() throws Exception { + reader.close(); + 
directory.close(); + super.tearDown(); + } + + private String[] docFields = { + "w1 w2 w3 w4 w5", //0 + "w1 w3 w2 w3",//1 + "w1 xx w2 yy w3",//2 + "w1 w3 xx w2 yy w3",//3 + "u2 u2 u1", //4 + "u2 xx u2 u1",//5 + "u2 u2 xx u1", //6 + "u2 xx u2 yy u1", //7 + "u2 xx u1 u2",//8 + "u1 u2 xx u2",//9 + "u2 u1 xx u2",//10 + "t1 t2 t1 t3 t2 t3"};//11 + + public TermQuery makeTermQuery(String text) { + return new TermQuery(new Term(field, text)); + } + + private void checkHits(Query query, int[] results) throws IOException { + CheckHits.checkHits(random(), query, field, searcher, results); + } + + private void orderedSlopTest3SQ(Query q1, Query q2, Query q3, int slop, + int[] expectedDocs) throws IOException { + BooleanQuery query = new BooleanQuery(); + query.add(q1, Occur.MUST); + query.add(q2, Occur.MUST); + query.add(q3, Occur.MUST); + Query snq = new IntervalFilterQuery(query, new WithinOrderedFilter(slop)); + checkHits(snq, expectedDocs); + } + + public void orderedSlopTest3(int slop, int[] expectedDocs) throws IOException { + orderedSlopTest3SQ(makeTermQuery("w1"), makeTermQuery("w2"), + makeTermQuery("w3"), slop, expectedDocs); + } + + public void orderedSlopTest3Equal(int slop, int[] expectedDocs) + throws IOException { + orderedSlopTest3SQ(makeTermQuery("w1"), makeTermQuery("w3"), + makeTermQuery("w3"), slop, expectedDocs); + } + + public void orderedSlopTest1Equal(int slop, int[] expectedDocs) + throws IOException { + orderedSlopTest3SQ(makeTermQuery("u2"), makeTermQuery("u2"), + makeTermQuery("u1"), slop, expectedDocs); + } + + public void testNearOrdered01() throws Exception { + orderedSlopTest3(0, new int[] {0}); + } + + public void testNearOrdered02() throws Exception { + orderedSlopTest3(1, new int[] {0, 1}); + } + + public void testNearOrdered03() throws Exception { + orderedSlopTest3(2, new int[] {0, 1, 2}); + } + + public void testNearOrdered04() throws Exception { + orderedSlopTest3(3, new int[] {0, 1, 2, 3}); + } + + public void testNearOrdered05() throws 
Exception { + orderedSlopTest3(4, new int[] {0, 1, 2, 3}); + } + + public void testNearOrderedEqual01() throws Exception { + orderedSlopTest3Equal(0, new int[] {}); + } + + public void testNearOrderedEqual02() throws Exception { + orderedSlopTest3Equal(1, new int[] {1}); + } + + public void testNearOrderedEqual03() throws Exception { + orderedSlopTest3Equal(2, new int[] {1}); + } + + public void testNearOrderedEqual04() throws Exception { + orderedSlopTest3Equal(3, new int[] {1, 3}); + } + + public void testNearOrderedEqual11() throws Exception { + orderedSlopTest1Equal(0, new int[] {4}); + } + + public void testNearOrderedEqual12() throws Exception { + orderedSlopTest1Equal(0, new int[] {4}); + } + + public void testNearOrderedEqual13() throws Exception { + orderedSlopTest1Equal(1, new int[] {4, 5, 6}); + } + + public void testNearOrderedEqual14() throws Exception { + orderedSlopTest1Equal(2, new int[] {4, 5, 6, 7}); + } + + public void testNearOrderedEqual15() throws Exception { + orderedSlopTest1Equal(3, new int[] {4, 5, 6, 7}); + } + + public void testNearOrderedOverlap() throws Exception { + BooleanQuery query = new BooleanQuery(); //"t1 t2 t1 t3 t2 t3" + query.add(new BooleanClause(new TermQuery(new Term(field, "t1")), + Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term(field, "t2")), + Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term(field, "t3")), + Occur.MUST)); + IntervalFilterQuery positionFilterQuery = new IntervalFilterQuery( query, new WithinOrderedFilter(3)); + + Query rewrite = this.searcher.rewrite(positionFilterQuery); + AtomicReader r = this.reader.getContext().leaves().get(0).reader(); + Weight createWeight = rewrite.createWeight(new IndexSearcher(r)); + + Scorer scorer = createWeight.scorer(r.getContext(), random() + .nextBoolean(), true, PostingFeatures.POSITIONS, r.getLiveDocs()); + IntervalIterator positions = scorer.intervals(false); + positions.scorer.advance(11); + positions.scorerAdvanced(11); + 
Interval interval = positions.next(); + assertNotNull("first range", interval); + assertEquals("first doc", 11, positions.docID()); + assertEquals("first start", 0, interval.begin); + assertEquals("first end", 3, interval.end); + + + interval = positions.next(); + assertNotNull("second range", interval); + assertEquals("second doc", 11, positions.docID()); + assertEquals("second start", 2, interval.begin); + assertEquals("second end", 5, interval.end); + + assertNull("third range", positions.next()); + } + + public static class BlockPositionIteratorFilter implements IntervalFilter { + + @Override + public IntervalIterator filter(boolean collectIntervals, IntervalIterator iter) { + return new BlockIntervalIterator(collectIntervals, iter); + } + + } + + private int advanceIter(IntervalIterator iter, int pos) throws IOException { + return iter.scorerAdvanced(iter.scorer.advance(pos)); + } + public void testNearUnOrdered() throws Exception { + { + BooleanQuery query = new BooleanQuery(); + query.add(makeTermQuery("u1"), Occur.MUST); + query.add(makeTermQuery("u2"), Occur.MUST); + Query snq = new IntervalFilterQuery(query, new WithinIntervalFilter( + 0)); + Query rewrite = this.searcher.rewrite(snq); + AtomicReader r = this.reader.getContext().leaves().get(0).reader(); + Weight createWeight = rewrite.createWeight(new IndexSearcher(r)); + + Scorer scorer = createWeight.scorer(r.getContext(), random() + .nextBoolean(), true, PostingFeatures.POSITIONS, r.getLiveDocs()); + IntervalIterator positions = scorer.intervals(false); + advanceIter(positions, 4); + + Interval interval = positions.next(); + assertNotNull("Does not have next and it should", interval); + assertEquals("doc", 4, positions.docID()); + assertEquals("start " + interval, 1, interval.begin); + assertEquals("end", 2, interval.end); + + advanceIter(positions, 5); + interval = positions.next(); + assertNotNull("Does not have next and it should", interval); + assertEquals("doc", 5, positions.docID()); + 
assertEquals("start", 2, interval.begin); + assertEquals("end", 3, interval.end); + + advanceIter(positions, 8); + interval = positions.next(); + assertNotNull("Does not have next and it should", interval); + assertEquals("doc", 8, positions.docID()); + assertEquals("start", 2, interval.begin); + assertEquals("end", 3, interval.end); + + advanceIter(positions, 9); + interval = positions.next(); + assertNotNull("Does not have next and it should", interval); + assertEquals("doc", 9, positions.docID()); + assertEquals("start", 0, interval.begin); + assertEquals("end", 1, interval.end); + + advanceIter(positions, 10); + interval = positions.next(); + assertNotNull("Does not have next and it should", interval); + assertEquals("doc", 10, positions.docID()); + assertEquals("start", 0, interval.begin); + assertEquals("end", 1, interval.end); + + + assertNull("Has next and it shouldn't: " + positions.docID(), positions.next()); + } + + { + // ((u1 near u2) near u2) + BooleanQuery query = new BooleanQuery(); + query.add(makeTermQuery("u1"), Occur.MUST); + query.add(makeTermQuery("u2"), Occur.MUST); + Query nearQuery = new IntervalFilterQuery(query, + new WithinIntervalFilter(0)); + + BooleanQuery topLevel = new BooleanQuery(); + topLevel.add(nearQuery, Occur.MUST); + topLevel.add(makeTermQuery("u2"), Occur.MUST); + + + Query rewrite = this.searcher.rewrite(new IntervalFilterQuery(topLevel, + new WithinIntervalFilter(1))); + AtomicReader r = this.reader.getContext().leaves().get(0).reader(); + Weight createWeight = rewrite.createWeight(new IndexSearcher(r)); + Scorer scorer = createWeight.scorer(r.getContext(), random() + .nextBoolean(), true, PostingFeatures.POSITIONS, r.getLiveDocs()); + + IntervalIterator iterator = scorer.intervals(false); + assertEquals(4, advanceIter(iterator, 4)); + Interval interval = iterator.next(); + + assertNotNull("Does not have next and it should", interval); + // unordered spans can be subsets + assertEquals("doc", 4, iterator.docID()); + 
assertEquals("start", 1, interval.begin); + assertEquals("end", 2, interval.end); + + advanceIter(iterator, 5); + + + interval = iterator.next(); + assertNotNull("Does not have next and it should", interval); + assertEquals("doc", 5, iterator.docID()); + assertEquals("start", 2, interval.begin); + assertEquals("end", 3, interval.end); + + advanceIter(iterator, 8); // (u2 xx (u1 u2)) + + interval = iterator.next(); + assertNotNull("Does not have next and it should", interval); + assertEquals("doc", 8, iterator.docID()); + assertEquals("start", 2, interval.begin); + assertEquals("end", 3, interval.end); + + advanceIter(iterator, 9); // u2 u1 xx u2 + interval = iterator.next(); + assertNotNull("Does not have next and it should", interval); + assertEquals("doc", 9, iterator.docID()); + assertEquals("start", 0, interval.begin); + assertEquals("end", 1, interval.end); + + interval = iterator.next(); + assertNull("Has next and it shouldn't", interval); + + advanceIter(iterator, 10); + interval = iterator.next(); + assertNotNull("Does not have next and it should", interval); + assertEquals("doc", 10, iterator.docID()); + assertEquals("start", 0, interval.begin); + assertEquals("end", 1, interval.end); + + interval = iterator.next(); + assertNull("Has next and it shouldn't " + interval, interval); + } + } + + private IntervalIterator orIterator(String[] terms) throws Exception { + BooleanQuery query = new BooleanQuery(); + + for (int i = 0; i < terms.length; i++) { + query.add(makeTermQuery(terms[i]), Occur.SHOULD); + } + Query rewrite = this.searcher.rewrite(query); + AtomicReader r = this.reader.getContext().leaves().get(0).reader(); + Weight createWeight = rewrite.createWeight(new IndexSearcher(r)); + + Scorer scorer = createWeight.scorer(r.getContext(), true, true, PostingFeatures.POSITIONS, r.getLiveDocs()); + return scorer.intervals(false); + } + + private IntervalIterator tstNextPosition( + IntervalIterator iterator, int doc, int start, int end) + throws Exception { 
+ if (iterator.docID() != doc) { + iterator.scorer.advance(doc); + iterator.scorerAdvanced(doc); + } + assertEquals("doc", doc, iterator.docID()); + Interval next = iterator.next(); + assertNotNull("next", next); + assertEquals("begin", start, next.begin); + assertEquals("end", end, next.end + 1); + return iterator; + } + + public void testOrSingle() throws Exception { + IntervalIterator spans = orIterator(new String[] {"w5"}); + tstNextPosition(spans, 0, 4, 5); + assertNull("final next", spans.next()); + } + + public void testOrMovesForward() throws Exception { + IntervalIterator iterator = orIterator(new String[] {"w1", "xx"}); + advanceIter(iterator, 0); + assertNotNull(iterator.next()); + int doc = iterator.docID(); + assertEquals(0, doc); + assertEquals(1, advanceIter(iterator, 1)); + + } + + public void testSpanOrDouble() throws Exception { + IntervalIterator iterator = orIterator(new String[] {"w5", "yy"}); + tstNextPosition(iterator, 0, 4, 5); + tstNextPosition(iterator, 2, 3, 4); + tstNextPosition(iterator, 3, 4, 5); + tstNextPosition(iterator, 7, 3, 4); + assertNull("final next", iterator.next()); + } + + public void testOrDoubleSkip() throws Exception { + IntervalIterator iterator = orIterator(new String[] {"w5", "yy"}); + iterator.scorer.advance(3); + assertEquals("initial skipTo", 3, iterator.scorerAdvanced(3)); + assertEquals("doc", 3, iterator.docID()); + Interval next = iterator.next(); + assertEquals("start", 4, next.begin); + assertEquals("end", 4, next.end); + tstNextPosition(iterator, 7, 3, 4); + assertNull("final next", iterator.next()); + } + + public void testOrUnused() throws Exception { + IntervalIterator iterator = orIterator(new String[] {"w5", + "unusedTerm", "yy"}); + tstNextPosition(iterator, 0, 4, 5); + tstNextPosition(iterator, 2, 3, 4); + tstNextPosition(iterator, 3, 4, 5); + tstNextPosition(iterator, 7, 3, 4); + assertNull("final next", iterator.next()); + } + + public void testOrTripleSameDoc() throws Exception { + 
IntervalIterator iterator = orIterator(new String[] {"t1", "t2", + "t3"}); + tstNextPosition(iterator, 11, 0, 1); + tstNextPosition(iterator, 11, 1, 2); + tstNextPosition(iterator, 11, 2, 3); + tstNextPosition(iterator, 11, 3, 4); + tstNextPosition(iterator, 11, 4, 5); + tstNextPosition(iterator, 11, 5, 6); + assertNull("final next", iterator.next()); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/search/intervals/TestBlockIntervalIterator.java b/lucene/core/src/test/org/apache/lucene/search/intervals/TestBlockIntervalIterator.java new file mode 100644 index 0000000..2a9de6c --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/intervals/TestBlockIntervalIterator.java @@ -0,0 +1,176 @@ +package org.apache.lucene.search.intervals; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.search.intervals.BlockIntervalIterator; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalFilter; +import org.apache.lucene.search.intervals.IntervalFilterQuery; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +import java.io.IOException; +import java.util.List; + +public class TestBlockIntervalIterator extends LuceneTestCase { + + private static final void addDocs(RandomIndexWriter writer) throws CorruptIndexException, IOException { + { + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge hot! Pease porridge cold! Pease porridge in the pot nine days old! Some like it hot, some" + + " like it cold, Some like it in the pot nine days old! Pease porridge hot! Pease porridge cold!", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + + { + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge cold! Pease porridge hot! Pease porridge in the pot nine days old! Some like it cold, some" + + " like it hot, Some like it in the pot nine days old! Pease porridge cold! 
Pease porridge hot!", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + } + public void testExactPhraseBooleanConjunction() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + addDocs(writer); + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "pease")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "hot!")), Occur.MUST)); + { + IntervalFilterQuery filter = new IntervalFilterQuery(query, new BlockPositionIteratorFilter()); + TopDocs search = searcher.search(filter, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(2, search.totalHits); + assertEquals(0, scoreDocs[0].doc); + assertEquals(1, scoreDocs[1].doc); + } + query.add(new BooleanClause(new TermQuery(new Term("field", "pease")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "cold!")), Occur.MUST)); + + { + IntervalFilterQuery filter = new IntervalFilterQuery(query, new BlockPositionIteratorFilter()); + TopDocs search = searcher.search(filter, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(1, search.totalHits); + assertEquals(0, scoreDocs[0].doc); + } + + query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "pease")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "hot!")), Occur.MUST)); + { + IntervalFilterQuery filter = new IntervalFilterQuery(query, new BlockPositionIteratorFilter()); + TopDocs search = searcher.search(filter, 10); + assertEquals(0, 
search.totalHits); + } + reader.close(); + directory.close(); + } + + public void testBlockPositionIterator() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + addDocs(writer); + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "pease")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "hot!")), Occur.MUST)); + + Weight weight = query.createWeight(searcher); + IndexReaderContext topReaderContext = searcher.getTopReaderContext(); + List leaves = topReaderContext.leaves(); + assertEquals(1, leaves.size()); + for (AtomicReaderContext atomicReaderContext : leaves) { + Scorer scorer = weight.scorer(atomicReaderContext, true, true, PostingFeatures.POSITIONS, atomicReaderContext.reader().getLiveDocs()); + { + int nextDoc = scorer.nextDoc(); + assertEquals(0, nextDoc); + IntervalIterator positions = new BlockIntervalIterator(false, scorer.intervals(false)); + assertEquals(0, positions.scorerAdvanced(0)); + Interval interval = null; + int[] start = new int[] {0, 31}; + int[] end = new int[] {2, 33}; + // {start}term{end} - end is pos+1 + // {0}Pease porridge hot!{0} Pease porridge cold! Pease porridge in the pot nine days old! Some like it hot, some" + // like it cold, Some like it in the pot nine days old! 
{1}Pease porridge hot!{1} Pease porridge cold!", + for (int j = 0; j < end.length; j++) { + interval = positions.next(); + assertNotNull(interval); + assertEquals(start[j], interval.begin); + assertEquals(end[j], interval.end); + } + assertNull(positions.next()); + } + { + int nextDoc = scorer.nextDoc(); + assertEquals(1, nextDoc); + IntervalIterator positions = new BlockIntervalIterator(false, scorer.intervals(false)); + assertEquals(1, positions.scorerAdvanced(1)); + Interval interval = null; + int[] start = new int[] {3, 34}; + int[] end = new int[] {5, 36}; + // {start}term{end} - end is pos+1 + // Pease porridge cold! {0}Pease porridge hot!{0} Pease porridge in the pot nine days old! Some like it cold, some + // like it hot, Some like it in the pot nine days old! Pease porridge cold! {1}Pease porridge hot{1}! + for (int j = 0; j < end.length; j++) { + interval = positions.next(); + assertNotNull(interval); + assertEquals(j + "", start[j], interval.begin); + assertEquals(j+ "", end[j], interval.end); + } + assertNull(positions.next()); + } + } + + + reader.close(); + directory.close(); + } + + + public static class BlockPositionIteratorFilter implements IntervalFilter { + + @Override + public IntervalIterator filter(boolean collectIntervals, IntervalIterator iter) { + return new BlockIntervalIterator(collectIntervals, iter); + } + + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/intervals/TestBrouwerianQuery.java b/lucene/core/src/test/org/apache/lucene/search/intervals/TestBrouwerianQuery.java new file mode 100644 index 0000000..8ceb100 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/intervals/TestBrouwerianQuery.java @@ -0,0 +1,164 @@ +package org.apache.lucene.search.intervals; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReaderContext; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.search.intervals.NonOverlappingQuery; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.Weight; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +public class TestBrouwerianQuery extends LuceneTestCase { + + private static final void addDocs(RandomIndexWriter writer) throws CorruptIndexException, IOException { + { + Document 
doc = new Document(); + doc.add(newField( + "field", + "The quick brown fox jumps over the lazy dog", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + + { + Document doc = new Document(); + doc.add(newField( + "field", + "The quick brown duck jumps over the lazy dog with the quick brown fox", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + } + + public void testBrouwerianBooleanQuery() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + addDocs(writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "the")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "quick")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "jumps")), Occur.MUST)); + BooleanQuery sub = new BooleanQuery(); + sub.add(new BooleanClause(new TermQuery(new Term("field", "fox")), Occur.MUST)); + NonOverlappingQuery q = new NonOverlappingQuery(query, sub); + TopDocs search = searcher.search(q, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(1, search.totalHits); + assertEquals(1, scoreDocs[0].doc); + + reader.close(); + directory.close(); + } + + public void testBrouwerianBooleanQueryExcludedDoesNotExist() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + addDocs(writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "the")), Occur.MUST)); + 
query.add(new BooleanClause(new TermQuery(new Term("field", "quick")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "jumps")), Occur.MUST)); + BooleanQuery sub = new BooleanQuery(); + sub.add(new BooleanClause(new TermQuery(new Term("field", "blox")), Occur.MUST)); + NonOverlappingQuery q = new NonOverlappingQuery(query, sub); + TopDocs search = searcher.search(q, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(2, search.totalHits); + assertEquals(0, scoreDocs[0].doc); + assertEquals(1, scoreDocs[1].doc); + + + reader.close(); + directory.close(); + } + + public void testPositions() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + addDocs(writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "the")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "quick")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "jumps")), Occur.MUST)); + BooleanQuery sub = new BooleanQuery(); + sub.add(new BooleanClause(new TermQuery(new Term("field", "fox")), Occur.MUST)); + NonOverlappingQuery q = new NonOverlappingQuery(query, sub); + Weight weight = q.createWeight(searcher); + IndexReaderContext topReaderContext = searcher.getTopReaderContext(); + List leaves = topReaderContext.leaves(); + assertEquals(1, leaves.size()); + for (AtomicReaderContext atomicReaderContext : leaves) { + Scorer scorer = weight.scorer(atomicReaderContext, true, true, PostingFeatures.POSITIONS, atomicReaderContext.reader().getLiveDocs()); + IntervalIterator positions = scorer.intervals(false); + int nextDoc = scorer.nextDoc(); + assertEquals(1, 
positions.scorerAdvanced(nextDoc)); + + + assertEquals(1, nextDoc); + Interval interval = null; + int[] start = new int[] {0, 1, 4}; + int[] end = new int[] {4, 6, 11}; + for (int j = 0; j < end.length; j++) { + interval = positions.next(); + assertNotNull("" + j, interval); + assertEquals(start[j], interval.begin); + assertEquals(end[j], interval.end); + } + assertNull(positions.next()); + assertEquals(Scorer.NO_MORE_DOCS, scorer.nextDoc()); + reader.close(); + directory.close(); + } + + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/intervals/TestConjunctionIntervalIterator.java b/lucene/core/src/test/org/apache/lucene/search/intervals/TestConjunctionIntervalIterator.java new file mode 100644 index 0000000..8186469 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/intervals/TestConjunctionIntervalIterator.java @@ -0,0 +1,153 @@ +package org.apache.lucene.search.intervals; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalFilterQuery; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.search.intervals.RangeIntervalFilter; +import org.apache.lucene.search.intervals.WithinIntervalFilter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +import java.io.IOException; +import java.util.List; + +public class TestConjunctionIntervalIterator extends LuceneTestCase { + + private static final void addDocs(RandomIndexWriter writer) throws CorruptIndexException, IOException { + { + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge hot! Pease porridge cold! Pease porridge in the pot nine days old! Some like it hot, some" + + " like it cold, Some like it in the pot nine days old! Pease porridge hot! Pease porridge cold!", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + + { + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge cold! Pease porridge hot! Pease porridge in the pot nine days old! Some like it cold, some" + + " like it hot, Some like it in the pot nine days old! Pease porridge cold! 
Pease porridge hot!", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + } + public void testConjunctionPositionsBooleanQuery() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + addDocs(writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "pease")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "hot!")), Occur.MUST)); + { + IntervalFilterQuery filter = new IntervalFilterQuery(query, new RangeIntervalFilter(0,3)); + TopDocs search = searcher.search(filter, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(1, search.totalHits); + assertEquals(0, scoreDocs[0].doc); + } + { + IntervalFilterQuery filter = new IntervalFilterQuery(query, new WithinIntervalFilter(3)); + TopDocs search = searcher.search(filter, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(2, search.totalHits); + assertEquals(0, scoreDocs[0].doc); + assertEquals(1, scoreDocs[1].doc); + } + + reader.close(); + directory.close(); + } + + public void testConjuctionPositionIterator() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + addDocs(writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "pease")), 
Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "hot!")), Occur.MUST)); + Weight weight = query.createWeight(searcher); + IndexReaderContext topReaderContext = searcher.getTopReaderContext(); + List leaves = topReaderContext.leaves(); + assertEquals(1, leaves.size()); + for (AtomicReaderContext atomicReaderContext : leaves) { + Scorer scorer = weight.scorer(atomicReaderContext, true, true, PostingFeatures.POSITIONS, atomicReaderContext.reader().getLiveDocs()); + { + int nextDoc = scorer.nextDoc(); + assertEquals(0, nextDoc); + IntervalIterator positions = scorer.intervals(false); + assertEquals(0, positions.scorerAdvanced(nextDoc)); + Interval interval = null; + int[] start = new int[] {0, 1, 2, 31, 32, 33}; + int[] end = new int[] {2, 3, 4, 33, 34, 35}; + // {start}term{end} - end is pos+1 + // {0}Pease {1}porridge {2}hot!{0} Pease{1} porridge{2} cold! Pease porridge in the pot nine days old! Some like it hot, some" + // like it cold, Some like it in the pot nine days old! {3}Pease {4}porridge {5}hot!{3} Pease{4} porridge{5} cold!", + for (int j = 0; j < end.length; j++) { + interval = positions.next(); + assertNotNull("" + j, interval); + assertEquals(start[j], interval.begin); + assertEquals(end[j], interval.end); + } + assertNull(positions.next()); + } + { + int nextDoc = scorer.nextDoc(); + assertEquals(1, nextDoc); + IntervalIterator positions = scorer.intervals(false); + assertEquals(1, positions.scorerAdvanced(nextDoc)); + Interval interval = null; + int[] start = new int[] {3, 4, 5, 34 }; + int[] end = new int[] {5, 6, 7, 36 }; + // {start}term{end} - end is pos+1 + // {0}Pease {1}porridge cold! {0}Pease {1}porridge {2}hot!{0} Pease{1} porridge{2} in the pot nine days old! Some like it cold, some + // like it hot, Some like it in the pot nine days old! Pease porridge cold! {4}Pease porridge hot{4}! 
+ for (int j = 0; j < end.length; j++) { + interval = positions.next(); + assertNotNull(interval); + assertEquals(j + "", start[j], interval.begin); + assertEquals(j+ "", end[j], interval.end); + } + assertNull(positions.next()); + } + } + reader.close(); + directory.close(); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/intervals/TestDisjunctionIntervalIterator.java b/lucene/core/src/test/org/apache/lucene/search/intervals/TestDisjunctionIntervalIterator.java new file mode 100644 index 0000000..1b17411 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/intervals/TestDisjunctionIntervalIterator.java @@ -0,0 +1,229 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalFilterQuery; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.search.intervals.RangeIntervalFilter; +import org.apache.lucene.search.intervals.WithinIntervalFilter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +import java.io.IOException; +import java.util.List; + +public class TestDisjunctionIntervalIterator extends LuceneTestCase { + private static final void addDocs(RandomIndexWriter writer) + throws CorruptIndexException, IOException { + { + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge hot! Pease porridge cold! Pease porridge in the pot nine days old! Some like it hot, some" + + " like it cold, Some like it in the pot nine days old! Pease porridge hot! Pease porridge cold!", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + + { + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge cold! Pease porridge hot! Pease porridge in the pot nine days old! Some like it cold, some" + + " like it hot, Some like it in the pot nine days old! Pease porridge cold! 
Pease porridge hot!", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + } + + public void testDisjunctionPositionsBooleanQuery() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + addDocs(writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), + Occur.SHOULD)); + query.add(new BooleanClause(new TermQuery(new Term("field", "pease")), + Occur.SHOULD)); + query.add(new BooleanClause(new TermQuery(new Term("field", "hot!")), + Occur.SHOULD)); + { + IntervalFilterQuery filter = new IntervalFilterQuery(query, + new RangeIntervalFilter(0, 3)); + TopDocs search = searcher.search(filter, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(2, search.totalHits); + assertEquals(0, scoreDocs[0].doc); + } + { + IntervalFilterQuery filter = new IntervalFilterQuery(query, + new WithinIntervalFilter(3)); + TopDocs search = searcher.search(filter, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(2, search.totalHits); + assertEquals(0, scoreDocs[0].doc); + assertEquals(1, scoreDocs[1].doc); + } + + reader.close(); + directory.close(); + } + + public void testBasic() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + { + Document doc = new Document(); + doc.add(newField("field", "the quick brown fox", TextField.TYPE_STORED)); + writer.addDocument(doc); + } + { + Document doc = new Document(); + doc.add(newField("field", "the brown quick fox", TextField.TYPE_STORED)); + writer.addDocument(doc); + } + + IndexReader reader = writer.getReader(); + 
IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "quick")), + Occur.SHOULD)); + query.add(new BooleanClause(new TermQuery(new Term("field", "brown")), + Occur.SHOULD)); + + Weight weight = query.createWeight(searcher); + IndexReaderContext topReaderContext = searcher.getTopReaderContext(); + List leaves = topReaderContext.leaves(); + assertEquals(1, leaves.size()); + Scorer scorer = weight.scorer(leaves.get(0), + true, true, PostingFeatures.POSITIONS, leaves.get(0).reader().getLiveDocs()); + IntervalIterator positions = scorer.intervals(false); + for (int i = 0; i < 2; i++) { + + int nextDoc = scorer.nextDoc(); + assertEquals(i, nextDoc); + assertEquals(i, positions.scorerAdvanced(nextDoc)); + Interval interval = positions.next(); + assertEquals(1, interval.begin); + assertEquals(1, interval.end); + + interval = positions.next(); + assertEquals(2, interval.begin); + assertEquals(2, interval.end); + assertNull(positions.next()); + } + reader.close(); + directory.close(); + + } + + public void testDisjunctionPositionIterator() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + addDocs(writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query1 = new BooleanQuery(); + query1.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), + Occur.MUST)); + query1.add(new BooleanClause(new TermQuery(new Term("field", "pease")), + Occur.MUST)); + query1.add(new BooleanClause(new TermQuery(new Term("field", "hot!")), + Occur.MUST)); + + BooleanQuery query2 = new BooleanQuery(); + query2.add(new BooleanClause(new TermQuery(new Term("field", "pease")), + Occur.MUST)); + query2.add(new BooleanClause(new 
TermQuery(new Term("field", "porridge")), + Occur.MUST)); + query2.add(new BooleanClause(new TermQuery(new Term("field", "hot!")), + Occur.MUST)); + + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(query1, Occur.SHOULD)); + query.add(new BooleanClause(query2, Occur.SHOULD)); + Weight weight = query.createWeight(searcher); + IndexReaderContext topReaderContext = searcher.getTopReaderContext(); + List leaves = topReaderContext.leaves(); + assertEquals(1, leaves.size()); + for (AtomicReaderContext atomicReaderContext : leaves) { + Scorer scorer = weight.scorer(atomicReaderContext, + true, true, PostingFeatures.POSITIONS, atomicReaderContext.reader().getLiveDocs()); + { + int nextDoc = scorer.nextDoc(); + assertEquals(0, nextDoc); + IntervalIterator positions = scorer.intervals(false); + assertEquals(0, positions.scorerAdvanced(nextDoc)); + Interval interval = null; + int[] start = new int[] { 0, 1, 2, 31, 32, 33 }; + int[] end = new int[] { 2, 3, 4, 33, 34, 35 }; + // {start}term{end} - end is pos+1 + // {0}Pease {1}porridge {2}hot!{0} Pease{1} porridge{2} cold! Pease porridge in the pot nine days old! Some like it hot, some" + // like it cold, Some like it in the pot nine days old! {3}Pease {4}porridge {5}hot!{3} Pease{4} porridge{5} cold!", + for (int j = 0; j < end.length; j++) { + interval = positions.next(); + assertNotNull("" + j, interval); + assertEquals(start[j], interval.begin); + assertEquals(end[j], interval.end); + } + assertNull(positions.next()); + } + { + int nextDoc = scorer.nextDoc(); + assertEquals(1, nextDoc); + IntervalIterator positions = scorer.intervals(false); + assertEquals(1, positions.scorerAdvanced(nextDoc)); + Interval interval = null; + int[] start = new int[] { 3, 4, 5, 34 }; + int[] end = new int[] { 5, 6, 7, 36 }; + // {start}term{end} - end is pos+1 + // {0}Pease {1}porridge cold! {0}Pease {1}porridge {2}hot!{0} Pease{1} porridge{2} in the pot nine days old! 
Some like it cold, some + // like it hot, Some like it in the pot nine days old! Pease porridge cold! {4}Pease porridge hot{4}! + for (int j = 0; j < end.length; j++) { + interval = positions.next(); + assertNotNull(interval); + assertEquals(j + "", start[j], interval.begin); + assertEquals(j + "", end[j], interval.end); + } + assertNull(positions.next()); + } + } + reader.close(); + directory.close(); + } +} + diff --git a/lucene/core/src/test/org/apache/lucene/search/intervals/TestIntervalFilterQueries.java b/lucene/core/src/test/org/apache/lucene/search/intervals/TestIntervalFilterQueries.java new file mode 100644 index 0000000..2458457 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/intervals/TestIntervalFilterQueries.java @@ -0,0 +1,111 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.*; +import org.apache.lucene.search.intervals.OrderedNearQuery; +import org.apache.lucene.search.intervals.UnorderedNearQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +import java.io.IOException; + +public class TestIntervalFilterQueries extends LuceneTestCase { + + private IndexSearcher searcher; + private IndexReader reader; + private Directory directory; + + public static final String field = "field"; + + @Override + public void setUp() throws Exception { + super.setUp(); + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())) + .setMergePolicy(newLogMergePolicy())); + for (int i = 0; i < docFields.length; i++) { + Document doc = new Document(); + doc.add(newField(field, docFields[i], TextField.TYPE_STORED)); + writer.addDocument(doc); + } + writer.forceMerge(1); + reader = writer.getReader(); + writer.close(); + searcher = newSearcher(reader); + } + + @Override + public void tearDown() throws Exception { + reader.close(); + directory.close(); + super.tearDown(); + } + + private String[] docFields = { + "w1 w2 w3 w4 w5 w6 w7 w8 w9 w10 w11 w12", //0 + "w1 w3 w4 w5 w6 w7 w8", //1 + "w1 w3 w10 w4 w5 w6 w7 w8", //2 + "w1 w3 w2 w4 w5 w6 w7 w8", //3 + }; + + public TermQuery makeTermQuery(String text) { + return new TermQuery(new Term(field, text)); + } + + private void checkHits(Query query, int[] results) throws IOException { + CheckHits.checkHits(random(), query, field, searcher, results); + } + + // or(w1 pre/2 w2, w1 pre/3 w10) + public void testOrNearNearQuery() throws IOException { + 
Query near1 = new OrderedNearQuery(2, makeTermQuery("w1"), makeTermQuery("w2")); + Query near2 = new OrderedNearQuery(3, makeTermQuery("w1"), makeTermQuery("w10")); + BooleanQuery bq = new BooleanQuery(); + bq.add(near1, BooleanClause.Occur.SHOULD); + bq.add(near2, BooleanClause.Occur.SHOULD); + + checkHits(bq, new int[] { 0, 2, 3 }); + } + + // or(w2 within/2 w1, w10 within/3 w1) + public void testUnorderedNearNearQuery() throws IOException { + Query near1 = new UnorderedNearQuery(2, makeTermQuery("w2"), makeTermQuery("w1")); + Query near2 = new UnorderedNearQuery(3, makeTermQuery("w10"), makeTermQuery("w1")); + BooleanQuery bq = new BooleanQuery(); + bq.add(near1, BooleanClause.Occur.SHOULD); + bq.add(near2, BooleanClause.Occur.SHOULD); + + checkHits(bq, new int[] { 0, 2, 3 }); + } + + // (a pre/2 b) pre/6 (c pre/2 d) + public void testNearNearNearQuery() throws IOException { + Query near1 = new OrderedNearQuery(2, makeTermQuery("w1"), makeTermQuery("w4")); + Query near2 = new OrderedNearQuery(2, makeTermQuery("w10"), makeTermQuery("w12")); + Query near3 = new OrderedNearQuery(6, near1, near2); + checkHits(near3, new int[] { 0 }); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/intervals/TestOrderedConjunctionIntervalIterator.java b/lucene/core/src/test/org/apache/lucene/search/intervals/TestOrderedConjunctionIntervalIterator.java new file mode 100644 index 0000000..5a7a45c --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/intervals/TestOrderedConjunctionIntervalIterator.java @@ -0,0 +1,186 @@ +package org.apache.lucene.search.intervals; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalFilter; +import org.apache.lucene.search.intervals.IntervalFilterQuery; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.search.intervals.OrderedConjunctionIntervalIterator; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +import java.io.IOException; +import java.util.List; + +public class TestOrderedConjunctionIntervalIterator extends LuceneTestCase { + + private static final void addDocs(RandomIndexWriter writer) throws CorruptIndexException, IOException { + { + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge hot! Pease porridge cold! Pease porridge in the pot nine days old! Some like it hot, some" + + " like it cold, Some like it in the pot nine days old! Pease porridge hot! Pease porridge cold!", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + + { + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge cold! Pease porridge hot! Pease porridge in the pot nine days old! Some like it cold, some" + + " like it hot, Some like it in the pot nine days old! Pease porridge cold! 
Pease porridge hot!", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + + { + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge cold! Pease porridge hot! Pease porridge in the pot nine days old!", + TextField.TYPE_STORED)); + writer.addDocument(doc); + } + } + public void testConjunctionPositionsBooleanQuery() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + addDocs(writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "hot!")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "pease")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), Occur.MUST)); + { + IntervalFilterQuery filter = new IntervalFilterQuery(query, new OrderedConjunctionPositionIteratorFilter()); + TopDocs search = searcher.search(filter, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(3, search.totalHits); + assertEquals(2, scoreDocs[0].doc); + assertEquals(0, scoreDocs[1].doc); + assertEquals(1, scoreDocs[2].doc); + } + + query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "old!")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "pease")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "cold!")), Occur.MUST)); + + { + IntervalFilterQuery filter = new IntervalFilterQuery(query, new OrderedConjunctionPositionIteratorFilter()); + TopDocs search = searcher.search(filter, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(2, search.totalHits); + 
assertEquals(0, scoreDocs[0].doc); + assertEquals(1, scoreDocs[1].doc); + } + + reader.close(); + directory.close(); + } + + public void testConjuctionPositionIterator() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + addDocs(writer); + + IndexReader reader = writer.getReader(); + writer.forceMerge(1); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term("field", "pease")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "hot!")), Occur.MUST)); + Weight weight = query.createWeight(searcher); + IndexReaderContext topReaderContext = searcher.getTopReaderContext(); + List leaves = topReaderContext.leaves(); + assertEquals(1, leaves.size()); + for (AtomicReaderContext atomicReaderContext : leaves) { + Scorer scorer = weight.scorer(atomicReaderContext, true, true, PostingFeatures.POSITIONS, atomicReaderContext.reader() + .getLiveDocs()); + { + int nextDoc = scorer.nextDoc(); + assertEquals(0, nextDoc); + IntervalIterator positions = new OrderedConjunctionIntervalIterator(false, scorer.intervals(false)); + assertEquals(0, positions.scorerAdvanced(nextDoc)); + Interval interval = null; + int[] start = new int[] {0, 31}; + int[] end = new int[] {2, 33}; + // {start}term{end} - end is pos+1 + // {0}Pease porridge hot!{0} Pease porridge cold! Pease porridge in the pot nine days old! Some like it hot, some" + // like it cold, Some like it in the pot nine days old! 
{1}Pease porridge hot!{1} Pease porridge cold!", + + for (int j = 0; j < end.length; j++) { + interval = positions.next(); + + assertNotNull(interval); + assertEquals(start[j], interval.begin); + assertEquals(end[j], interval.end); + } + assertNull(positions.next()); + } + { + int nextDoc = scorer.nextDoc(); + assertEquals(1, nextDoc); + IntervalIterator positions = new OrderedConjunctionIntervalIterator(false, scorer.intervals(false)); + assertEquals(1, positions.scorerAdvanced(nextDoc)); + Interval interval = null; + int[] start = new int[] {3, 34}; + int[] end = new int[] {5, 36}; + // {start}term{end} - end is pos+1 + // Pease porridge cold! {0}Pease porridge hot!{0} Pease porridge in the pot nine days old! Some like it cold, some + // like it hot, Some like it in the pot nine days old! Pease porridge cold! {1}Pease porridge hot{1}! + for (int j = 0; j < end.length; j++) { + interval = positions.next(); + assertNotNull(interval); + assertEquals(j + "", start[j], interval.begin); + assertEquals(j+ "", end[j], interval.end); + } + assertNull(positions.next()); + } + } + reader.close(); + directory.close(); + } + + public static class OrderedConjunctionPositionIteratorFilter implements IntervalFilter { + + @Override + public IntervalIterator filter(boolean collectIntervals, IntervalIterator iter) { + return new OrderedConjunctionIntervalIterator(collectIntervals, iter); + } + + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/intervals/TestPositionsAndOffsets.java b/lucene/core/src/test/org/apache/lucene/search/intervals/TestPositionsAndOffsets.java new file mode 100644 index 0000000..9b3db21 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/intervals/TestPositionsAndOffsets.java @@ -0,0 +1,200 @@ +package org.apache.lucene.search.intervals; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; +import org.apache.lucene.codecs.memory.MemoryPostingsFormat; +import org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat; +import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReaderContext; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.Weight.PostingFeatures; 
+import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +import java.io.IOException; +import java.util.List; + +public class TestPositionsAndOffsets extends LuceneTestCase { + + // What am I testing here? + // - can get offsets out of a basic TermQuery, and a more complex BooleanQuery + // - if offsets are not stored, then we get -1 returned + + IndexWriterConfig iwc; + + public void setUp() throws Exception { + super.setUp(); + + // Currently only SimpleText and Lucene40 can index offsets into postings: + String codecName = Codec.getDefault().getName(); + assumeTrue("Codec does not support offsets: " + codecName, + codecName.equals("SimpleText") || + codecName.equals("Lucene40") || codecName.equals("Lucene41")); + + iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + + if (codecName.equals("Lucene40")) { + // Sep etc are not implemented + switch(random().nextInt(4)) { + case 0: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene41PostingsFormat())); break; + case 1: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat())); break; + case 2: iwc.setCodec(_TestUtil.alwaysPostingsFormat( + new Pulsing41PostingsFormat(_TestUtil.nextInt(random(), 1, 3)))); break; + case 3: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat())); break; + } + } + } + + + private static void addDocs(RandomIndexWriter writer, boolean withOffsets) throws IOException { + FieldType fieldType = TextField.TYPE_STORED; + if (withOffsets) { + fieldType = new FieldType(fieldType); + fieldType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + } + Document doc = new Document(); + doc.add(newField( + "field", + "Pease porridge hot! Pease porridge cold! Pease porridge in the pot nine days old! 
Some like it hot, some" + + " like it cold, Some like it in the pot nine days old! Pease porridge hot! Pease porridge cold!", + fieldType)); + writer.addDocument(doc); + } + + private void testQuery(Query query, int[][] expectedOffsets) throws IOException { + testQuery(query, expectedOffsets, true); + } + + private void testQuery(Query query, int[][] expectedOffsets, boolean needsOffsets) throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc); + addDocs(writer, needsOffsets); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = new IndexSearcher(reader); + writer.close(); + + Weight weight = query.createWeight(searcher); + IndexReaderContext topReaderContext = searcher.getTopReaderContext(); + List leaves = topReaderContext.leaves(); + assertEquals(1, leaves.size()); + Scorer scorer = weight.scorer(leaves.get(0), + true, true, PostingFeatures.OFFSETS, leaves.get(0).reader().getLiveDocs()); + + int nextDoc = scorer.nextDoc(); + assertEquals(0, nextDoc); + IntervalIterator positions = scorer.intervals(false); + int startOffsets[] = expectedOffsets[0]; + int endOffsets[] = expectedOffsets[1]; + + assertEquals(0, positions.scorerAdvanced(nextDoc)); + for (int i = 0; i < startOffsets.length; i++) { + Interval interval = positions.next(); + assertEquals("i: " + i, startOffsets[i], interval.offsetBegin); + assertEquals("i: " + i, endOffsets[i], interval.offsetEnd); + } + + assertNull(positions.next()); + + reader.close(); + directory.close(); + } + + public void testTermQueryWithoutOffsets() throws IOException { + Query query = new TermQuery(new Term("field", "porridge")); + int[] startOffsets = new int[] { 6, 26, 47, 164, 184 }; + int[] endOffsets = new int[] { 14, 34, 55, 172, 192 }; + testQuery(query, new int[][] { startOffsets, endOffsets }); + } + + public void testBooleanQueryWithOffsets() throws IOException { + + BooleanQuery query = new BooleanQuery(); + 
query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), + BooleanClause.Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term("field", "nine")), + BooleanClause.Occur.MUST)); + int[] startOffsetsConj = new int[] {47,143}; + int[] endOffsetsConj = new int[] {71, 172}; + testQuery(query, new int[][] {startOffsetsConj, endOffsetsConj}); + } + + public void testExactPhraseQuery() throws IOException { + PhraseQuery query = new PhraseQuery(); + query.add(new Term("field", "pease")); + query.add(new Term("field", "porridge")); + query.add(new Term("field", "hot!")); + int[] startOffsetsBlock = new int[] {0, 158}; + int[] endOffsetsBlock = new int[] {19, 177}; + testQuery(query, new int[][] {startOffsetsBlock, endOffsetsBlock}); + } + + public void testSloppyPhraseQuery() throws IOException { + PhraseQuery query = new PhraseQuery(); + query.add(new Term("field", "pease")); + query.add(new Term("field", "hot!")); + query.setSlop(1); + int[] startOffsetsBlock = new int[] {0, 158}; + int[] endOffsetsBlock = new int[] {19, 177}; + testQuery(query, new int[][] {startOffsetsBlock, endOffsetsBlock}); + } + + public void testManyTermSloppyPhraseQuery() throws IOException { + PhraseQuery query = new PhraseQuery(); + query.add(new Term("field", "pease")); + query.add(new Term("field", "porridge")); + query.add(new Term("field", "pot")); + query.setSlop(2); + int[] startOffsetsBlock = new int[] {41}; + int[] endOffsetsBlock = new int[] {66}; + testQuery(query, new int[][] {startOffsetsBlock, endOffsetsBlock}); + } + + public void testMultiTermPhraseQuery() throws IOException { + MultiPhraseQuery query = new MultiPhraseQuery(); + query.add(new Term("field", "pease")); + query.add(new Term("field", "porridge")); + query + .add(new Term[] {new Term("field", "hot!"), new Term("field", "cold!")}); + int[] startOffsetsBlock = new int[] {0, 20, 158, 178}; + int[] endOffsetsBlock = new int[] {19, 40, 177, 198}; + testQuery(query, new int[][] {startOffsetsBlock, 
endOffsetsBlock}); + } +} \ No newline at end of file diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestNearSpansOrdered.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestNearSpansOrdered.java index 48a4b4f..e8ba500 100644 --- a/lucene/core/src/test/org/apache/lucene/search/spans/TestNearSpansOrdered.java +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestNearSpansOrdered.java @@ -30,6 +30,7 @@ import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Weight; import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -167,7 +168,7 @@ public class TestNearSpansOrdered extends LuceneTestCase { Weight w = searcher.createNormalizedWeight(q); IndexReaderContext topReaderContext = searcher.getTopReaderContext(); AtomicReaderContext leave = topReaderContext.leaves().get(0); - Scorer s = w.scorer(leave, true, false, leave.reader().getLiveDocs()); + Scorer s = w.scorer(leave, true, false, PostingFeatures.POSITIONS, leave.reader().getLiveDocs()); assertEquals(1, s.advance(1)); } diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java index 61495de..9f6d2f8 100644 --- a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java @@ -38,6 +38,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.store.Directory; @@ -428,7 +429,7 @@ public class TestSpans 
extends LuceneTestCase { slop, ordered); - spanScorer = searcher.createNormalizedWeight(snq).scorer(ctx, true, false, ctx.reader().getLiveDocs()); + spanScorer = searcher.createNormalizedWeight(snq).scorer(ctx, true, false, PostingFeatures.POSITIONS, ctx.reader().getLiveDocs()); } finally { searcher.setSimilarity(oldSim); } diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java index 22e0b54..d1cffbb 100644 --- a/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java +++ b/lucene/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java @@ -18,14 +18,26 @@ package org.apache.lucene.search.grouping; */ +import java.io.IOException; + import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.search.*; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.FieldComparator; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopDocsCollector; +import org.apache.lucene.search.TopFieldCollector; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.PriorityQueue; -import java.io.IOException; - // TODO: this sentence is too long for the class summary. 
/** BlockGroupingCollector performs grouping with a * single pass collector, as long as you are grouping by a @@ -117,6 +129,11 @@ public class BlockGroupingCollector extends Collector { public int nextDoc() { throw new UnsupportedOperationException(); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException(); + } } private static final class OneGroup { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/ArrayIntervalIterator.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/ArrayIntervalIterator.java new file mode 100644 index 0000000..61b435e --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/ArrayIntervalIterator.java @@ -0,0 +1,69 @@ +package org.apache.lucene.search.highlight.positions; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalCollector; +import org.apache.lucene.search.intervals.IntervalIterator; + +/** + * Present an array of PositionIntervals as an Iterator. 
+ * @lucene.experimental + */ +public class ArrayIntervalIterator extends IntervalIterator { + + private int next = 0; + private int count; + private Interval[] positions; + + public ArrayIntervalIterator (Interval[] positions, int count) { + super(null, false); + this.positions = positions; + this.count = count; + } + + @Override + public Interval next() { + if (next >= count) + return null; + return positions[next++]; + } + + @Override + public IntervalIterator[] subs(boolean inOrder) { + return EMPTY; + } + + @Override + public void collect(IntervalCollector collector) { + assert collectIntervals; + } + + @Override + public int scorerAdvanced(int docId) throws IOException { + return 0; + } + + @Override + public int matchDistance() { + return 0; + } + +} \ No newline at end of file diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/DocAndPositions.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/DocAndPositions.java new file mode 100644 index 0000000..c9c3fb1 --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/DocAndPositions.java @@ -0,0 +1,66 @@ +package org.apache.lucene.search.highlight.positions; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Comparator; + +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.util.ArrayUtil; + +/** Used to accumulate position intervals while scoring + * @lucene.experimental + */ +public final class DocAndPositions extends ScoreDoc { + + public int posCount = 0; + public Interval[] positions; + + public DocAndPositions(int doc) { + super(doc, 0); + positions = new Interval[32]; + } + + public void storePosition (Interval pos) { + ensureStorage(); + positions[posCount++] = (Interval) pos.clone(); + } + + private void ensureStorage () { + if (posCount >= positions.length) { + Interval temp[] = new Interval[positions.length * 2]; + System.arraycopy(positions, 0, temp, 0, positions.length); + positions = temp; + } + } + + public Interval[] sortedPositions() { + ArrayUtil.mergeSort(positions, 0, posCount, new Comparator() { + public int compare(Interval o1, Interval o2) { + return + o1.begin < o2.begin ? -1 : + (o1.begin > o2.begin ? 1 : + (o1.end < o2.end ? -1 : + (o1.end > o2.end ? 1 : + 0))); + } + + }); + return positions; + } +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/HighlightingIntervalCollector.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/HighlightingIntervalCollector.java new file mode 100644 index 0000000..bb6da8d --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/HighlightingIntervalCollector.java @@ -0,0 +1,109 @@ +package org.apache.lucene.search.highlight.positions; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight.PostingFeatures; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalCollector; +import org.apache.lucene.search.intervals.IntervalIterator; + +/** + * Collects the first maxDocs docs and their positions matching the query + * + * @lucene.experimental + */ + +public class HighlightingIntervalCollector extends Collector implements IntervalCollector { + + int count; + DocAndPositions docs[]; + + public HighlightingIntervalCollector (int maxDocs) { + docs = new DocAndPositions[maxDocs]; + } + + protected Scorer scorer; + private IntervalIterator positions; + + @Override + public void collect(int doc) throws IOException { + if (count >= docs.length) + return; + addDoc (doc); + // consume any remaining positions the scorer didn't report + docs[count-1].score=scorer.score(); + positions.scorerAdvanced(doc); + while(positions.next() != null) { + positions.collect(this); + } + } + + private boolean addDoc (int doc) { + if (count <= 0 || docs[count-1].doc != doc) { + DocAndPositions spdoc = new DocAndPositions (doc); + docs[count++] = spdoc; + return true; + } + return false; + } + + public boolean acceptsDocsOutOfOrder() { + return 
false; + } + + public void setScorer(Scorer scorer) throws IOException { + this.scorer = scorer; + positions = scorer.intervals(true); + // If we want to visit the other scorers, we can, here... + } + + public Scorer getScorer () { + return scorer; + } + + public DocAndPositions[] getDocs () { + DocAndPositions ret[] = new DocAndPositions[count]; + System.arraycopy(docs, 0, ret, 0, count); + return ret; + } + + public void setNextReader(AtomicReaderContext context) throws IOException { + } + + @Override + public PostingFeatures postingFeatures() { + return PostingFeatures.OFFSETS; + } + + @Override + public void collectLeafPosition(Scorer scorer, Interval interval, + int docID) { + addDoc(docID); + docs[count - 1].storePosition(interval); + } + + @Override + public void collectComposite(Scorer scorer, Interval interval, + int docID) { + } + +} diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/IntervalTokenStream.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/IntervalTokenStream.java new file mode 100644 index 0000000..c5ee2ff --- /dev/null +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/positions/IntervalTokenStream.java @@ -0,0 +1,74 @@ +package org.apache.lucene.search.highlight.positions; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.search.intervals.Interval; +import org.apache.lucene.search.intervals.IntervalIterator; + +/** + * A TokenStream constructed from a stream of positions and their offsets. + * The document is segmented into tokens at the start and end offset of each interval. The intervals + * are assumed to be non-overlapping. + * + * TODO: abstract the dependency on the current PositionOffsetMapper impl; + * allow for implementations of position->offset maps that don't rely on term vectors. + * + * @lucene.experimental + */ +public class IntervalTokenStream extends TokenStream { + + //this tokenizer generates four attributes: + // term, offset, positionIncrement? and type? 
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + //private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final String text; + private final IntervalIterator positions; + + // the index of the current position interval + private Interval pos = null; + + public IntervalTokenStream (String text, IntervalIterator positions) { + this.text = text; + this.positions = positions; + } + + @Override + public final boolean incrementToken() throws IOException { + pos = positions.next(); + if (pos == null){ + return false; + } + int b, e; + b = pos.offsetBegin; + e = pos.offsetEnd; + assert b >=0; + termAtt.append(text, b, e); + offsetAtt.setOffset(b, e); + posIncrAtt.setPositionIncrement(1); + return true; + } + +} diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/positions/IntervalHighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/positions/IntervalHighlighterTest.java new file mode 100644 index 0000000..f9580fc --- /dev/null +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/positions/IntervalHighlighterTest.java @@ -0,0 +1,547 @@ +package org.apache.lucene.search.highlight.positions; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; +import org.apache.lucene.codecs.memory.MemoryPostingsFormat; +import org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat; +import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import 
org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.highlight.Highlighter; +import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; +import org.apache.lucene.search.highlight.SimpleFragmenter; +import org.apache.lucene.search.highlight.TextFragment; +import org.apache.lucene.search.intervals.BlockIntervalIterator; +import org.apache.lucene.search.intervals.IntervalFilter; +import org.apache.lucene.search.intervals.IntervalFilterQuery; +import org.apache.lucene.search.intervals.IntervalIterator; +import org.apache.lucene.search.intervals.NonOverlappingQuery; +import org.apache.lucene.search.intervals.OrderedNearQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; +import org.junit.Ignore; + +/** + * TODO: FIX THIS TEST Phrase and Span Queries positions callback API + */ +public class IntervalHighlighterTest extends LuceneTestCase { + + protected final static String F = "f"; + protected Analyzer analyzer; + protected Directory dir; + protected IndexSearcher searcher; + private IndexWriterConfig iwc; + + private static final String PORRIDGE_VERSE = "Pease porridge hot! Pease porridge cold! Pease porridge in the pot nine days old! Some like it hot, some" + + " like it cold, Some like it in the pot nine days old! Pease porridge hot! 
Pease porridge cold!"; + + public void setUp() throws Exception { + super.setUp(); + // Currently only SimpleText and Lucene40 can index offsets into postings: + String codecName = Codec.getDefault().getName(); + assumeTrue("Codec does not support offsets: " + codecName, + codecName.equals("SimpleText") || codecName.equals("Lucene40")); + iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)).setOpenMode(OpenMode.CREATE); + + if (codecName.equals("Lucene40")) { + // Sep etc are not implemented + switch(random().nextInt(4)) { + case 0: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene41PostingsFormat())); break; + case 1: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat())); break; + case 2: iwc.setCodec(_TestUtil.alwaysPostingsFormat( + new Pulsing41PostingsFormat(_TestUtil.nextInt(random(), 1, 3)))); break; + case 3: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat())); break; + } + } + analyzer = iwc.getAnalyzer(); + dir = newDirectory(); + } + + public void close() throws IOException { + if (searcher != null) { + searcher.getIndexReader().close(); + searcher = null; + } + dir.close(); + } + + // make several docs + protected void insertDocs(Analyzer analyzer, String... 
values) + throws Exception { + IndexWriter writer = new IndexWriter(dir, iwc); + FieldType type = new FieldType(); + type.setIndexed(true); + type.setTokenized(true); + type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + type.setStored(true); + for (String value : values) { + Document doc = new Document(); + Field f = newField(F, value, type); + doc.add(f); + writer.addDocument(doc); + } + writer.close(); + if (searcher != null) { + searcher.getIndexReader().close(); + } + searcher = new IndexSearcher(DirectoryReader.open(dir)); + } + + protected static TermQuery termQuery(String term) { + return new TermQuery(new Term(F, term)); + } + + private String[] doSearch(Query q) throws IOException, + InvalidTokenOffsetsException { + return doSearch(q, 100); + } + + private class ConstantScorer implements + org.apache.lucene.search.highlight.Scorer { + + @Override + public TokenStream init(TokenStream tokenStream) throws IOException { + return tokenStream; + } + + @Override + public void startFragment(TextFragment newFragment) {} + + @Override + public float getTokenScore() { + return 1; + } + + @Override + public float getFragmentScore() { + return 1; + } + } + + private String getHighlight(Query q) throws IOException, InvalidTokenOffsetsException { + return doSearch(q, Integer.MAX_VALUE)[0]; + } + + private String[] doSearch(Query q, int maxFragSize) throws IOException, + InvalidTokenOffsetsException { + return doSearch(q, maxFragSize, 0); + } + private String[] doSearch(Query q, int maxFragSize, int docIndex) throws IOException, InvalidTokenOffsetsException { + return doSearch(q, maxFragSize, docIndex, false); + } + private String[] doSearch(Query q, int maxFragSize, int docIndex, boolean analyze) + throws IOException, InvalidTokenOffsetsException { + // ConstantScorer is a fragment Scorer, not a search result (document) + // Scorer + Highlighter highlighter = new Highlighter(new ConstantScorer()); + highlighter.setTextFragmenter(new 
SimpleFragmenter(maxFragSize)); + HighlightingIntervalCollector collector = new HighlightingIntervalCollector(10); + if (q instanceof MultiTermQuery) { + ((MultiTermQuery) q) + .setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); + } + searcher.search(q, collector); + DocAndPositions doc = collector.docs[docIndex]; + if (doc == null) return null; + String text = searcher.getIndexReader().document(doc.doc).get(F); + // FIXME: test error cases: for non-stored fields, and fields w/no term + // vectors + // searcher.getIndexReader().getTermFreqVector(doc.doc, F, pom); + final TokenStream stream; + if (analyze) { + stream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, + MockTokenFilter.EMPTY_STOPSET, true).tokenStream(F, + new StringReader(text)); + } else { + stream = new IntervalTokenStream(text, new ArrayIntervalIterator( + doc.sortedPositions(), doc.posCount)); + } + // + TextFragment[] fragTexts = highlighter.getBestTextFragments( + stream , text, false, 10); + String[] frags = new String[fragTexts.length]; + for (int i = 0; i < frags.length; i++) + frags[i] = fragTexts[i].toString(); + return frags; + } + + public void testTerm() throws Exception { + insertDocs(analyzer, "This is a test test"); + String frags[] = doSearch(termQuery("test")); + assertEquals("This is a test test", frags[0]); + close(); + } + + public void testSeveralSnippets() throws Exception { + String input = "this is some long text. It has the word long in many places. In fact, it has long on some different fragments. " + + "Let us see what happens to long in this case."; + String gold = "this is some long text. It has the word long in many places. In fact, it has long on some different fragments. 
" + + "Let us see what happens to long in this case."; + insertDocs(analyzer, input); + String frags[] = doSearch(termQuery("long"), input.length()); + assertEquals(gold, frags[0]); + close(); + } + + public void testBooleanAnd() throws Exception { + insertDocs(analyzer, "This is a test"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(termQuery("This"), Occur.MUST)); + bq.add(new BooleanClause(termQuery("test"), Occur.MUST)); + String frags[] = doSearch(bq); + assertEquals("This is a test", frags[0]); + close(); + } + + public void testConstantScore() throws Exception { + insertDocs(analyzer, "This is a test"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(termQuery("This"), Occur.MUST)); + bq.add(new BooleanClause(termQuery("test"), Occur.MUST)); + String frags[] = doSearch(new ConstantScoreQuery(bq)); + assertEquals("This is a test", frags[0]); + close(); + } + + public void testBooleanAndOtherOrder() throws Exception { + insertDocs(analyzer, "This is a test"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(new TermQuery(new Term(F, "test")), Occur.MUST)); + bq.add(new BooleanClause(new TermQuery(new Term(F, "This")), Occur.MUST)); + String frags[] = doSearch(bq); + assertEquals("This is a test", frags[0]); + close(); + } + + public void testBooleanOr() throws Exception { + insertDocs(analyzer, "This is a test"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(new TermQuery(new Term(F, "test")), Occur.SHOULD)); + bq.add(new BooleanClause(new TermQuery(new Term(F, "This")), Occur.SHOULD)); + String frags[] = doSearch(bq); + assertEquals("This is a test", frags[0]); + close(); + } + + public void testSingleMatchScorer() throws Exception { + insertDocs(analyzer, "This is a test"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(new TermQuery(new Term(F, "test")), Occur.SHOULD)); + bq.add(new BooleanClause(new TermQuery(new Term(F, "notoccurringterm")), + 
Occur.SHOULD)); + String frags[] = doSearch(bq); + assertEquals("This is a test", frags[0]); + close(); + } + + public void testBooleanNrShouldMatch() throws Exception { + insertDocs(analyzer, "a b c d e f g h i"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(new TermQuery(new Term(F, "a")), Occur.SHOULD)); + bq.add(new BooleanClause(new TermQuery(new Term(F, "b")), Occur.SHOULD)); + bq.add(new BooleanClause(new TermQuery(new Term(F, "no")), Occur.SHOULD)); + + // This generates a ConjunctionSumScorer + bq.setMinimumNumberShouldMatch(2); + String frags[] = doSearch(bq); + assertEquals("a b c d e f g h i", frags[0]); + + // This generates no scorer + bq.setMinimumNumberShouldMatch(3); + frags = doSearch(bq); + assertNull(frags); + + // This generates a DisjunctionSumScorer + bq.setMinimumNumberShouldMatch(2); + bq.add(new BooleanClause(new TermQuery(new Term(F, "c")), Occur.SHOULD)); + frags = doSearch(bq); + assertEquals("a b c d e f g h i", frags[0]); + close(); + } + + public void testPhrase() throws Exception { + insertDocs(analyzer, "is it that this is a test, is it"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(new TermQuery(new Term(F, "is")), Occur.MUST)); + bq.add(new BooleanClause(new TermQuery(new Term(F, "a")), Occur.MUST)); + IntervalFilterQuery pfq = new IntervalFilterQuery(bq, + new BlockPositionIteratorFilter()); + String frags[] = doSearch(pfq); + // make sure we highlight the phrase, and not the terms outside the phrase + assertEquals("is it that this is a test, is it", frags[0]); + close(); + } + + /* + * Failing ... PhraseQuery scorer needs positions()? 
+ */ + //@Ignore + public void testPhraseOriginal() throws Exception { + insertDocs(analyzer, "This is a test"); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term(F, "a")); + pq.add(new Term(F, "test")); + String frags[] = doSearch(pq); + assertEquals("This is a test", frags[0]); + close(); + } + + public void testNestedBoolean() throws Exception { + insertDocs(analyzer, "This is a test"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(new TermQuery(new Term(F, "test")), Occur.SHOULD)); + BooleanQuery bq2 = new BooleanQuery(); + bq2.add(new BooleanClause(new TermQuery(new Term(F, "This")), Occur.SHOULD)); + bq2.add(new BooleanClause(new TermQuery(new Term(F, "is")), Occur.SHOULD)); + bq.add(new BooleanClause(bq2, Occur.SHOULD)); + String frags[] = doSearch(bq); + assertEquals("This is a test", frags[0]); + close(); + } + + public void testWildcard() throws Exception { + insertDocs(analyzer, "This is a test"); + String frags[] = doSearch(new WildcardQuery(new Term(F, "t*t"))); + assertEquals("This is a test", frags[0]); + close(); + } + + public void testMixedBooleanNot() throws Exception { + insertDocs(analyzer, "this is a test", "that is an elephant"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(new TermQuery(new Term(F, "test")), Occur.MUST)); + bq.add(new BooleanClause(new TermQuery(new Term(F, "that")), Occur.MUST_NOT)); + String frags[] = doSearch(bq); + assertEquals("this is a test", frags[0]); + close(); + } + + public void testMixedBooleanShould() throws Exception { + insertDocs(analyzer, "this is a test", "that is an elephant", "the other was a rhinoceros"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(new TermQuery(new Term(F, "is")), Occur.MUST)); + bq.add(new BooleanClause(new TermQuery(new Term(F, "test")), Occur.SHOULD)); + String frags[] = doSearch(bq, 50, 0); + assertEquals("this is a test", frags[0]); + frags = doSearch(bq, 50, 1); + assertEquals("that is an elephant", 
frags[0]); + + bq.add(new BooleanClause(new TermQuery(new Term(F, "rhinoceros")), Occur.SHOULD)); + frags = doSearch(bq, 50, 0); + assertEquals("this is a test", frags[0]); + frags = doSearch(bq, 50, 1); + assertEquals("that is an elephant", frags[0]); + close(); + } + + public void testMultipleDocumentsAnd() throws Exception { + insertDocs(analyzer, "This document has no matches", PORRIDGE_VERSE, + "This document has some Pease porridge in it"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(new TermQuery(new Term(F, "Pease")), Occur.MUST)); + bq.add(new BooleanClause(new TermQuery(new Term(F, "porridge")), Occur.MUST)); + String frags[] = doSearch(bq, 50, 0); + assertEquals( + "Pease porridge hot! Pease porridge cold! Pease", + frags[0]); + frags = doSearch(bq, 50, 1); + assertEquals("This document has some Pease porridge in it", + frags[0]); + close(); + } + + + public void testMultipleDocumentsOr() throws Exception { + insertDocs(analyzer, "This document has no matches", PORRIDGE_VERSE, + "This document has some Pease porridge in it"); + BooleanQuery bq = new BooleanQuery(); + bq.add(new BooleanClause(new TermQuery(new Term(F, "Pease")), Occur.SHOULD)); + bq.add(new BooleanClause(new TermQuery(new Term(F, "porridge")), + Occur.SHOULD)); + String frags[] = doSearch(bq, 50, 0); + assertEquals( + "Pease porridge hot! Pease porridge cold! 
Pease", + frags[0]); + frags = doSearch(bq, 50, 1); + assertEquals("This document has some Pease porridge in it", + frags[0]); + close(); + } + + public void testBrouwerianQuery() throws Exception { + + insertDocs(analyzer, "the quick brown duck jumps over the lazy dog with the quick brown fox"); + + BooleanQuery query = new BooleanQuery(); + query.add(new BooleanClause(new TermQuery(new Term(F, "the")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term(F, "quick")), Occur.MUST)); + query.add(new BooleanClause(new TermQuery(new Term(F, "jumps")), Occur.MUST)); + + assertEquals(getHighlight(query), + "the quick brown duck jumps over the lazy dog with the quick brown fox"); + + BooleanQuery sub = new BooleanQuery(); + sub.add(new BooleanClause(new TermQuery(new Term(F, "duck")), Occur.MUST)); + NonOverlappingQuery bq = new NonOverlappingQuery(query, sub); + + assertEquals(getHighlight(bq), + "the quick brown duck jumps over the lazy dog with the quick brown fox"); + + close(); + } + + @Ignore("not implemented yet - unsupported") + public void testMultiPhraseQuery() throws Exception { + MultiPhraseQuery query = new MultiPhraseQuery(); + insertDocs(analyzer, "pease porridge hot but not too hot or otherwise pease porridge cold"); + + query.add(terms(F, "pease"), 0); + query.add(terms(F, "porridge"), 1); + query.add(terms(F, "hot", "cold"), 2); + query.setSlop(1); + + String[] frags = doSearch(query, Integer.MAX_VALUE); + assertEquals("pease porridge hot but not too hot or otherwise pease porridge cold", frags[0]); + + close(); + } + + @Ignore("not implemented yet - unsupported") + public void testMultiPhraseQueryCollisions() throws Exception { + MultiPhraseQuery query = new MultiPhraseQuery(); + insertDocs(analyzer, "pease porridge hot not too hot or otherwise pease porridge porridge"); + + query.add(terms(F, "pease"), 0); + query.add(terms(F, "porridge"), 1); + query.add(terms(F, "coldasice", "porridge" ), 2); + query.setSlop(1); + + String[] frags = 
doSearch(query, Integer.MAX_VALUE); + assertEquals("pease porridge hot but not too hot or otherwise pease porridge porridge", frags[0]); + + close(); + } + + public void testNearPhraseQuery() throws Exception { + + insertDocs(analyzer, "pease porridge rather hot and pease porridge fairly cold"); + + Query firstQ = new OrderedNearQuery(4, termQuery("pease"), termQuery("porridge"), termQuery("hot")); + { + String frags[] = doSearch(firstQ, Integer.MAX_VALUE); + assertEquals("pease porridge rather hot and pease porridge fairly cold", frags[0]); + } + + // near.3(near.4(pease, porridge, hot), near.4(pease, porridge, cold)) + Query q = new OrderedNearQuery(3, + firstQ, + new OrderedNearQuery(4, termQuery("pease"), termQuery("porridge"), termQuery("cold"))); + + String frags[] = doSearch(q, Integer.MAX_VALUE); + assertEquals("pease porridge rather hot and pease porridge fairly cold", + frags[0]); + + close(); + } + + private Term[] terms(String field, String...tokens) { + Term[] terms = new Term[tokens.length]; + for (int i = 0; i < tokens.length; i++) { + terms[i] = new Term(field, tokens[i]); + } + return terms; + } + + public void testSloppyPhraseQuery() throws Exception { + assertSloppyPhrase( "a b c d a b c d e f", "a b c d a b c d e f", 2, "c", "a"); + assertSloppyPhrase( "a c e b d e f a b","a c e b d e f a b", 2, "a", "b"); + assertSloppyPhrase( "Y A X B A", "Y A X B A", 2, "X", "A", "A"); + + assertSloppyPhrase( "X A X B A","X A X B A", 2, "X", "A", "A"); // non overlapping minmal!! 
+ assertSloppyPhrase( "A A A X",null, 2, "X", "A", "A"); + assertSloppyPhrase( "A A X A", "A A X A", 2, "X", "A", "A"); + assertSloppyPhrase( "A A X A Y B A", "A A X A Y B A", 2, "X", "A", "A"); + assertSloppyPhrase( "A A X", null, 2, "X", "A", "A"); + assertSloppyPhrase( "A X A", null, 1, "X", "A", "A"); + + assertSloppyPhrase( "A X B A", "A X B A", 2, "X", "A", "A"); + assertSloppyPhrase( "A A X A X B A X B B A A X B A A", "A A X A X B A X B B A A X B A A", 2, "X", "A", "A"); + assertSloppyPhrase( "A A X A X B A X B B A A X B A A", "A A X A X B A X B B A A X B A A", 2, "X", "A", "A"); + + assertSloppyPhrase( "A A X A X B A", "A A X A X B A", 2, "X", "A", "A"); + assertSloppyPhrase( "A A Y A X B A", "A A Y A X B A", 2, "X", "A", "A"); + assertSloppyPhrase( "A A Y A X B A A", "A A Y A X B A A", 2, "X", "A", "A"); + assertSloppyPhrase( "A A X A Y B A", null , 1, "X", "A", "A"); + close(); + } + + + private void assertSloppyPhrase(String doc, String expected, int slop, String...query) throws Exception { + insertDocs(analyzer, doc); + PhraseQuery pq = new PhraseQuery(); + for (String string : query) { + pq.add(new Term(F, string)); + } + + pq.setSlop(slop); +// System.out.println(doc); + String[] frags = doSearch(pq, 50); + if (expected == null) { + assertNull(frags != null ? 
frags[0] : "", frags); + } else { + assertEquals(expected, frags[0]); + } + } + + public static class BlockPositionIteratorFilter implements IntervalFilter { + + @Override + public IntervalIterator filter(boolean collectIntervals, IntervalIterator iter) { + return new BlockIntervalIterator(collectIntervals, iter); + } + + } + +} diff --git a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java index 368c018..629d64e 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java @@ -30,6 +30,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Weight; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; @@ -99,7 +100,7 @@ class TermsIncludingScoreQuery extends Query { private TermsEnum segmentTermsEnum; public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - SVInnerScorer scorer = (SVInnerScorer) scorer(context, false, false, context.reader().getLiveDocs()); + SVInnerScorer scorer = (SVInnerScorer) scorer(context, false, false, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()); if (scorer != null) { if (scorer.advanceForExplainOnly(doc) == doc) { return scorer.explain(); @@ -120,7 +121,7 @@ class TermsIncludingScoreQuery extends Query { originalWeight.normalize(norm, topLevelBoost * TermsIncludingScoreQuery.this.getBoost()); } - public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, Bits acceptDocs) throws IOException { + public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, 
PostingFeatures flags, Bits acceptDocs) throws IOException { Terms terms = context.reader().terms(field); if (terms == null) { return null; } @@ -217,6 +218,10 @@ class TermsIncludingScoreQuery extends Query { } while (docId != DocIdSetIterator.NO_MORE_DOCS); return docId; } + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException(); + } @Override public float freq() { @@ -333,6 +338,12 @@ class TermsIncludingScoreQuery extends Query { public int advance(int target) throws IOException { return currentDoc = matchingDocsIterator.advance(target); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) + throws IOException { + return null; + } } // This scorer deals with the fact that a document can have more than one score from multiple related documents. diff --git a/lucene/join/src/java/org/apache/lucene/search/join/ToChildBlockJoinQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/ToChildBlockJoinQuery.java index 526101b..3898226 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/ToChildBlockJoinQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/ToChildBlockJoinQuery.java @@ -34,6 +34,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Scorer.ChildScorer; import org.apache.lucene.search.Weight; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; @@ -121,10 +122,10 @@ public class ToChildBlockJoinQuery extends Query { // child document space @Override public Scorer scorer(AtomicReaderContext readerContext, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { // Pass scoreDocsInOrder true, topScorer false to our sub: - final Scorer parentScorer = 
parentWeight.scorer(readerContext, true, false, null); + final Scorer parentScorer = parentWeight.scorer(readerContext, true, false, flags, null); if (parentScorer == null) { // No matches @@ -301,6 +302,11 @@ public class ToChildBlockJoinQuery extends Query { } return childDoc; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return parentScorer.intervals(collectIntervals); + } } @Override diff --git a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinCollector.java b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinCollector.java index 3a06310..2a7a569 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinCollector.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinCollector.java @@ -23,6 +23,7 @@ import org.apache.lucene.search.*; import org.apache.lucene.search.Scorer.ChildScorer; import org.apache.lucene.search.grouping.GroupDocs; import org.apache.lucene.search.grouping.TopGroups; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.ArrayUtil; import java.io.IOException; @@ -347,6 +348,11 @@ public class ToParentBlockJoinCollector extends Collector { public int nextDoc() { throw new UnsupportedOperationException(); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException(); + } } private OneGroup[] sortedGroups; diff --git a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java index 35e3ac6..5d7609a 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinQuery.java @@ -36,7 +36,9 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import 
org.apache.lucene.search.Scorer; import org.apache.lucene.search.Weight; +import org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.search.grouping.TopGroups; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; @@ -158,10 +160,10 @@ public class ToParentBlockJoinQuery extends Query { // parent document space @Override public Scorer scorer(AtomicReaderContext readerContext, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { // Pass scoreDocsInOrder true, topScorer false to our sub: - final Scorer childScorer = childWeight.scorer(readerContext, true, false, null); + final Scorer childScorer = childWeight.scorer(readerContext, true, false, flags, null); if (childScorer == null) { // No matches @@ -195,7 +197,7 @@ public class ToParentBlockJoinQuery extends Query { @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - BlockJoinScorer scorer = (BlockJoinScorer) scorer(context, true, false, context.reader().getLiveDocs()); + BlockJoinScorer scorer = (BlockJoinScorer) scorer(context, true, false, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()); if (scorer != null) { if (scorer.advance(doc) == doc) { return scorer.explain(context.docBase); @@ -414,6 +416,11 @@ public class ToParentBlockJoinQuery extends Query { ); } + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException(); + } + } @Override diff --git a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java index f2c98e7..ec30689 100644 --- a/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java +++ 
b/lucene/join/src/test/org/apache/lucene/search/join/TestBlockJoin.java @@ -22,6 +22,7 @@ import org.apache.lucene.document.*; import org.apache.lucene.index.*; import org.apache.lucene.search.*; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.search.grouping.GroupDocs; import org.apache.lucene.search.grouping.TopGroups; import org.apache.lucene.store.Directory; @@ -962,7 +963,7 @@ public class TestBlockJoin extends LuceneTestCase { ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(tq, parentFilter, ScoreMode.Avg); Weight weight = s.createNormalizedWeight(q); - DocIdSetIterator disi = weight.scorer(s.getIndexReader().leaves().get(0), true, true, null); + DocIdSetIterator disi = weight.scorer(s.getIndexReader().leaves().get(0), true, true, PostingFeatures.DOCS_AND_FREQS, null); assertEquals(1, disi.advance(1)); r.close(); dir.close(); @@ -996,7 +997,7 @@ public class TestBlockJoin extends LuceneTestCase { ToParentBlockJoinQuery q = new ToParentBlockJoinQuery(tq, parentFilter, ScoreMode.Avg); Weight weight = s.createNormalizedWeight(q); - DocIdSetIterator disi = weight.scorer(s.getIndexReader().leaves().get(0), true, true, null); + DocIdSetIterator disi = weight.scorer(s.getIndexReader().leaves().get(0), true, true, PostingFeatures.DOCS_AND_FREQS, null); assertEquals(2, disi.advance(0)); r.close(); dir.close(); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java index 7e7a250..fa23cba 100755 --- a/lucene/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/CustomScoreQuery.java @@ -32,6 +32,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.Weight; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.IndexSearcher; +import 
org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.ToStringUtils; @@ -228,19 +229,19 @@ public class CustomScoreQuery extends Query { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { // Pass true for "scoresDocsInOrder", because we // require in-order scoring, even if caller does not, // since we call advance on the valSrcScorers. Pass // false for "topScorer" because we will not invoke // score(Collector) on these scorers: - Scorer subQueryScorer = subQueryWeight.scorer(context, true, false, acceptDocs); + Scorer subQueryScorer = subQueryWeight.scorer(context, true, false, flags, acceptDocs); if (subQueryScorer == null) { return null; } Scorer[] valSrcScorers = new Scorer[valSrcWeights.length]; for(int i = 0; i < valSrcScorers.length; i++) { - valSrcScorers[i] = valSrcWeights[i].scorer(context, true, topScorer, acceptDocs); + valSrcScorers[i] = valSrcWeights[i].scorer(context, true, topScorer, flags, acceptDocs); } return new CustomScorer(CustomScoreQuery.this.getCustomScoreProvider(context), this, getBoost(), subQueryScorer, valSrcScorers); } @@ -346,6 +347,11 @@ public class CustomScoreQuery extends Query { } return doc; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return subQueryScorer.intervals(collectIntervals); + } } @Override diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java index b175912..9793906 100755 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/BoostedQuery.java @@ -18,6 +18,7 @@ package org.apache.lucene.queries.function; */ import 
org.apache.lucene.search.*; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; @@ -98,9 +99,9 @@ public class BoostedQuery extends Query { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { // we are gonna advance() the subscorer - Scorer subQueryScorer = qWeight.scorer(context, true, false, acceptDocs); + Scorer subQueryScorer = qWeight.scorer(context, true, false, flags, acceptDocs); if(subQueryScorer == null) { return null; } @@ -188,6 +189,11 @@ public class BoostedQuery extends Query { res.addDetail(vals.explain(doc)); return res; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + return scorer.intervals(collectIntervals); + } } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java index 558f8b8..a803a60 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/FunctionQuery.java @@ -21,6 +21,7 @@ import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.*; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.Bits; @@ -91,13 +92,13 @@ public class FunctionQuery extends Query { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { 
return new AllScorer(context, acceptDocs, this, queryWeight); } @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - return ((AllScorer)scorer(context, true, true, context.reader().getLiveDocs())).explain(doc); + return ((AllScorer)scorer(context, true, true, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs())).explain(doc); } } @@ -174,6 +175,11 @@ public class FunctionQuery extends Query { result.addDetail(new Explanation(weight.queryNorm,"queryNorm")); return result; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException("AllScorer doesn't support interval iterators."); + } } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/ValueSourceScorer.java b/lucene/queries/src/java/org/apache/lucene/queries/function/ValueSourceScorer.java index 6976a57..3439f2d 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/ValueSourceScorer.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/ValueSourceScorer.java @@ -20,6 +20,7 @@ package org.apache.lucene.queries.function; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.Bits; import java.io.IOException; @@ -88,6 +89,11 @@ public class ValueSourceScorer extends Scorer { } @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException("ValueSourceScorer doesn't support interval iterators."); + } + + @Override public float freq() throws IOException { return 1; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/QueryValueSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/QueryValueSource.java index 3e22e8e..c0ebabf 100755 --- 
a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/QueryValueSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/QueryValueSource.java @@ -23,6 +23,7 @@ import org.apache.lucene.queries.function.FunctionValues; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.queries.function.docvalues.FloatDocValues; import org.apache.lucene.search.*; +import org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.util.Bits; import org.apache.lucene.util.mutable.MutableValue; import org.apache.lucene.util.mutable.MutableValueFloat; @@ -123,7 +124,7 @@ class QueryDocValues extends FloatDocValues { try { if (doc < lastDocRequested) { if (noMatches) return defVal; - scorer = weight.scorer(readerContext, true, false, acceptDocs); + scorer = weight.scorer(readerContext, true, false, PostingFeatures.DOCS_AND_FREQS, acceptDocs); if (scorer==null) { noMatches = true; return defVal; @@ -154,7 +155,7 @@ class QueryDocValues extends FloatDocValues { try { if (doc < lastDocRequested) { if (noMatches) return false; - scorer = weight.scorer(readerContext, true, false, acceptDocs); + scorer = weight.scorer(readerContext, true, false, PostingFeatures.DOCS_AND_FREQS, acceptDocs); scorerDoc = -1; if (scorer==null) { noMatches = true; @@ -212,7 +213,7 @@ class QueryDocValues extends FloatDocValues { mval.exists = false; return; } - scorer = weight.scorer(readerContext, true, false, acceptDocs); + scorer = weight.scorer(readerContext, true, false, PostingFeatures.DOCS_AND_FREQS, acceptDocs); scorerDoc = -1; if (scorer==null) { noMatches = true;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingIndexSearcher.java b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingIndexSearcher.java index 396358e..9c7fc04 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/AssertingIndexSearcher.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/AssertingIndexSearcher.java @@ -76,8 +76,8 @@ public class AssertingIndexSearcher extends IndexSearcher { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { - Scorer scorer = w.scorer(context, scoreDocsInOrder, topScorer, acceptDocs); + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { + Scorer scorer = w.scorer(context, scoreDocsInOrder, topScorer, flags, acceptDocs); if (scorer != null) { // check that scorer obeys disi contract for docID() before next()/advance try { diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java b/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java index ff03f6e..e372686 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java @@ -33,6 +33,7 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.MultiReader; import org.apache.lucene.index.SlowCompositeReaderWrapper; +import 
org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.store.Directory; import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.store.RAMDirectory; @@ -280,7 +281,7 @@ public class QueryUtils { if (scorer == null) { Weight w = s.createNormalizedWeight(q); AtomicReaderContext context = readerContextArray.get(leafPtr); - scorer = w.scorer(context, true, false, context.reader().getLiveDocs()); + scorer = w.scorer(context, true, false, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()); } int op = order[(opidx[0]++) % order.length]; @@ -327,7 +328,7 @@ public class QueryUtils { indexSearcher.setSimilarity(s.getSimilarity()); Weight w = indexSearcher.createNormalizedWeight(q); AtomicReaderContext ctx = (AtomicReaderContext)indexSearcher.getTopReaderContext(); - Scorer scorer = w.scorer(ctx, true, false, ctx.reader().getLiveDocs()); + Scorer scorer = w.scorer(ctx, true, false, PostingFeatures.DOCS_AND_FREQS, ctx.reader().getLiveDocs()); if (scorer != null) { boolean more = scorer.advance(lastDoc[0] + 1) != DocIdSetIterator.NO_MORE_DOCS; Assert.assertFalse("query's last doc was "+ lastDoc[0] +" but skipTo("+(lastDoc[0]+1)+") got to "+scorer.docID(),more); @@ -354,7 +355,7 @@ public class QueryUtils { indexSearcher.setSimilarity(s.getSimilarity()); Weight w = indexSearcher.createNormalizedWeight(q); AtomicReaderContext ctx = previousReader.getContext(); - Scorer scorer = w.scorer(ctx, true, false, ctx.reader().getLiveDocs()); + Scorer scorer = w.scorer(ctx, true, false, PostingFeatures.DOCS_AND_FREQS, ctx.reader().getLiveDocs()); if (scorer != null) { boolean more = scorer.advance(lastDoc[0] + 1) != DocIdSetIterator.NO_MORE_DOCS; Assert.assertFalse("query's last doc was "+ lastDoc[0] +" but skipTo("+(lastDoc[0]+1)+") got to "+scorer.docID(),more); @@ -385,7 +386,7 @@ public class QueryUtils { long startMS = System.currentTimeMillis(); for (int i=lastDoc[0]+1; i<=doc; i++) { Weight w = s.createNormalizedWeight(q); - 
Scorer scorer = w.scorer(context.get(leafPtr), true, false, liveDocs); + Scorer scorer = w.scorer(context.get(leafPtr), true, false, PostingFeatures.DOCS_AND_FREQS, liveDocs); Assert.assertTrue("query collected "+doc+" but skipTo("+i+") says no more docs!",scorer.advance(i) != DocIdSetIterator.NO_MORE_DOCS); Assert.assertEquals("query collected "+doc+" but skipTo("+i+") got to "+scorer.docID(),doc,scorer.docID()); float skipToScore = scorer.score(); @@ -413,7 +414,7 @@ public class QueryUtils { IndexSearcher indexSearcher = LuceneTestCase.newSearcher(previousReader); indexSearcher.setSimilarity(s.getSimilarity()); Weight w = indexSearcher.createNormalizedWeight(q); - Scorer scorer = w.scorer((AtomicReaderContext)indexSearcher.getTopReaderContext(), true, false, previousReader.getLiveDocs()); + Scorer scorer = w.scorer((AtomicReaderContext)indexSearcher.getTopReaderContext(), true, false, PostingFeatures.DOCS_AND_FREQS, previousReader.getLiveDocs()); if (scorer != null) { boolean more = scorer.advance(lastDoc[0] + 1) != DocIdSetIterator.NO_MORE_DOCS; Assert.assertFalse("query's last doc was "+ lastDoc[0] +" but skipTo("+(lastDoc[0]+1)+") got to "+scorer.docID(),more); @@ -438,7 +439,7 @@ public class QueryUtils { IndexSearcher indexSearcher = LuceneTestCase.newSearcher(previousReader); indexSearcher.setSimilarity(s.getSimilarity()); Weight w = indexSearcher.createNormalizedWeight(q); - Scorer scorer = w.scorer((AtomicReaderContext)indexSearcher.getTopReaderContext(), true, false, previousReader.getLiveDocs()); + Scorer scorer = w.scorer((AtomicReaderContext)indexSearcher.getTopReaderContext(), true, false, PostingFeatures.DOCS_AND_FREQS, previousReader.getLiveDocs()); if (scorer != null) { boolean more = scorer.advance(lastDoc[0] + 1) != DocIdSetIterator.NO_MORE_DOCS; Assert.assertFalse("query's last doc was "+ lastDoc[0] +" but skipTo("+(lastDoc[0]+1)+") got to "+scorer.docID(),more); diff --git a/solr/build.xml b/solr/build.xml index 06c0ffe..5764f7b 100644 --- 
a/solr/build.xml +++ b/solr/build.xml @@ -26,7 +26,7 @@ - + @@ -131,48 +131,14 @@ depends="compile-core, compile-contrib"/> - - + depends="test-solr-core, test-solrj"/> - + depends="javadocs-all,javadocs-solrj,javadocs-test-framework"/> - - - - - - - - - - - - - - - - - - - - - - - - - @@ -186,27 +152,15 @@ - - - - - - - - - - - - - - - + + - - + + + @@ -226,10 +180,10 @@ - + @@ -271,15 +225,13 @@ - - - - - - - - - + + + + + + + @@ -368,10 +320,6 @@ - - + depends="init-dist, dist, example, javadocs"> @@ -437,8 +385,8 @@ - @@ -496,15 +444,15 @@ - - - - + + + + @@ -520,74 +468,182 @@ - - - - - - - - - - - - - - - + + + + - - - - - - - - - + + + + - - - - - - - - - + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - + + + + - + - + - + - + + - + + + + + + + + + + + + + + + + + + + + + + + ... + + This task requires that the property 'stub.src.path' be set. + + It must contain a "path" listing directories containing source + files that this task should use when looking for classes that + need factories created, the format is platform specific -- + typically it is colon seperated in Unix, semi-colon seperated + on windows, ie: + + ant stub-factories -Dstub.src.path="core/src:../lucene/contrib:../lucene/core/src/java:../lucene/analysis" + + FYI: The file ${stub.list} contains a list of classes + that seem to need stub factories. (if java files can be found to + use as guides for creating them). 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -685,7 +741,17 @@ tofile="${analysis.conf.dest}/stopwords_tr.txt"/> - + + + + + + + + diff --git a/solr/core/src/java/org/apache/solr/schema/LatLonType.java b/solr/core/src/java/org/apache/solr/schema/LatLonType.java index eaa78cc..9b397be 100644 --- a/solr/core/src/java/org/apache/solr/schema/LatLonType.java +++ b/solr/core/src/java/org/apache/solr/schema/LatLonType.java @@ -24,6 +24,8 @@ import org.apache.lucene.queries.function.FunctionValues; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.queries.function.valuesource.VectorValueSource; import org.apache.lucene.search.*; +import org.apache.lucene.search.intervals.IntervalIterator; + import com.spatial4j.core.io.ParseUtils; import com.spatial4j.core.context.SpatialContext; import com.spatial4j.core.distance.DistanceUtils; @@ -344,13 +346,13 @@ class SpatialDistanceQuery extends ExtendedQueryBase implements PostFilter { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { return new SpatialScorer(context, acceptDocs, this, queryWeight); } @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - return ((SpatialScorer)scorer(context, true, true, context.reader().getLiveDocs())).explain(doc); + return ((SpatialScorer)scorer(context, true, true, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs())).explain(doc); } } @@ -504,6 +506,11 @@ class SpatialDistanceQuery extends ExtendedQueryBase implements PostFilter { result.addDetail(new Explanation(weight.queryNorm,"queryNorm")); return result; } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + throw new UnsupportedOperationException(); + } } @Override diff --git 
a/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java index 99de39b..8f9328c 100644 --- a/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/JoinQParserPlugin.java @@ -19,6 +19,7 @@ package org.apache.solr.search; import org.apache.lucene.index.*; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.*; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -215,7 +216,7 @@ class JoinQuery extends Query { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { if (filter == null) { boolean debug = rb != null && rb.isDebug(); long start = debug ? 
System.currentTimeMillis() : 0; @@ -484,7 +485,7 @@ class JoinQuery extends Query { @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - Scorer scorer = scorer(context, true, false, context.reader().getLiveDocs()); + Scorer scorer = scorer(context, true, false, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()); boolean exists = scorer.advance(doc) == doc; ComplexExplanation result = new ComplexExplanation(); @@ -542,6 +543,15 @@ class JoinQuery extends Query { public int advance(int target) throws IOException { return iter.advance(target); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + if (iter instanceof Scorer) { + return ((Scorer) iter).intervals(collectIntervals); + } + throw new UnsupportedOperationException("Positions are only supported for Scorers"); + + } } diff --git a/solr/core/src/java/org/apache/solr/search/SolrConstantScoreQuery.java b/solr/core/src/java/org/apache/solr/search/SolrConstantScoreQuery.java index 05a1315..b9dd1c2 100755 --- a/solr/core/src/java/org/apache/solr/search/SolrConstantScoreQuery.java +++ b/solr/core/src/java/org/apache/solr/search/SolrConstantScoreQuery.java @@ -2,6 +2,7 @@ package org.apache.solr.search; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.search.*; +import org.apache.lucene.search.intervals.IntervalIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.AtomicReaderContext; @@ -120,7 +121,7 @@ public class SolrConstantScoreQuery extends ConstantScoreQuery implements Extend @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, - boolean topScorer, Bits acceptDocs) throws IOException { + boolean topScorer, PostingFeatures flags, Bits acceptDocs) throws IOException { return new ConstantScorer(context, this, queryWeight, acceptDocs); } @@ -196,6 +197,14 @@ public class 
SolrConstantScoreQuery extends ConstantScoreQuery implements Extend public int advance(int target) throws IOException { return docIdSetIterator.advance(target); } + + @Override + public IntervalIterator intervals(boolean collectIntervals) throws IOException { + if (docIdSetIterator instanceof Scorer) { + return ((Scorer) docIdSetIterator).intervals(collectIntervals); + } + throw new UnsupportedOperationException("Positions are only supported for Scorers"); + } } @Override diff --git a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java index 0116658..4d83f33 100644 --- a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java +++ b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java @@ -35,6 +35,7 @@ import org.apache.lucene.document.StoredField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.*; import org.apache.lucene.search.*; +import org.apache.lucene.search.Weight.PostingFeatures; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.NRTCachingDirectory; @@ -2193,7 +2194,7 @@ class FilterImpl extends Filter { iterators.add(iter); } for (Weight w : weights) { - Scorer scorer = w.scorer(context, true, false, context.reader().getLiveDocs()); + Scorer scorer = w.scorer(context, true, false, PostingFeatures.DOCS_AND_FREQS, context.reader().getLiveDocs()); if (scorer == null) return null; iterators.add(scorer); }