diff --git a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java index b315edf..16aa231 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java @@ -118,6 +118,8 @@ final class TermScorer extends Scorer { public PositionInterval next() throws IOException { if (--positionsPending >= 0) { interval.begin = interval.end = docsAndPos.nextPosition(); + interval.offsetBegin = docsAndPos.startOffset(); + interval.offsetEnd = docsAndPos.endOffset(); return interval; } positionsPending = 0; diff --git a/lucene/core/src/java/org/apache/lucene/search/positions/BlockPositionIterator.java b/lucene/core/src/java/org/apache/lucene/search/positions/BlockPositionIterator.java index 6a985a7..9f2c1d2 100644 --- a/lucene/core/src/java/org/apache/lucene/search/positions/BlockPositionIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/positions/BlockPositionIterator.java @@ -30,10 +30,10 @@ public final class BlockPositionIterator extends PositionIntervalIterator { private final PositionIntervalIterator[] iterators; private static final PositionInterval INFINITE_INTERVAL = new PositionInterval( - Integer.MIN_VALUE, Integer.MIN_VALUE); + Integer.MIN_VALUE, Integer.MIN_VALUE, -1, -1); private final PositionInterval[] intervals; private final PositionInterval interval = new PositionInterval( - Integer.MIN_VALUE, Integer.MIN_VALUE); + Integer.MIN_VALUE, Integer.MIN_VALUE, -1, -1); private final int[] gaps; private final int lastIter; @@ -111,6 +111,8 @@ public final class BlockPositionIterator extends PositionIntervalIterator { } interval.begin = intervals[0].begin; interval.end = intervals[lastIter].end; + interval.offsetBegin = intervals[0].offsetBegin; + interval.offsetEnd = intervals[lastIter].offsetEnd; return interval; } diff --git a/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueue.java 
b/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueue.java index 6018713..6f90a49 100644 --- a/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueue.java +++ b/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueue.java @@ -27,7 +27,7 @@ import org.apache.lucene.util.PriorityQueue; // nocommit - javadoc abstract class IntervalQueue extends PriorityQueue { final PositionInterval queueInterval = new PositionInterval( - Integer.MIN_VALUE, Integer.MIN_VALUE); + Integer.MIN_VALUE, Integer.MIN_VALUE, -1, -1); public void reset() { clear(); diff --git a/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueueAnd.java b/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueueAnd.java index dd2b5ef..366e798 100644 --- a/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueueAnd.java +++ b/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueueAnd.java @@ -24,6 +24,7 @@ import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInter final class IntervalQueueAnd extends IntervalQueue { int rightExtreme = Integer.MIN_VALUE; + int rightExtremeOffset = Integer.MIN_VALUE; public IntervalQueueAnd(int size) { super(size); @@ -34,10 +35,12 @@ final class IntervalQueueAnd extends IntervalQueue { queueInterval.begin = Integer.MIN_VALUE; queueInterval.end = Integer.MIN_VALUE; rightExtreme = Integer.MIN_VALUE; + rightExtremeOffset = Integer.MIN_VALUE; } public void updateRightExtreme(PositionInterval interval) { - rightExtreme = Math.max(rightExtreme, Math.max(interval.end, interval.end)); + rightExtreme = Math.max(rightExtreme, interval.end); + rightExtremeOffset = Math.max(rightExtremeOffset, interval.offsetEnd); } public boolean topContainsQueueInterval() { @@ -49,7 +52,9 @@ final class IntervalQueueAnd extends IntervalQueue { public void updateQueueInterval() { PositionInterval interval = top().interval; queueInterval.begin = interval.begin; + 
queueInterval.offsetBegin = interval.offsetBegin; queueInterval.end = rightExtreme; + queueInterval.offsetEnd = rightExtremeOffset; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueueOr.java b/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueueOr.java index 6cb6d0f..a5a29f2 100644 --- a/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueueOr.java +++ b/lucene/core/src/java/org/apache/lucene/search/positions/IntervalQueueOr.java @@ -39,7 +39,9 @@ final class IntervalQueueOr extends IntervalQueue { public void updateQueueInterval() { PositionInterval interval = top().interval; queueInterval.begin = interval.begin; + queueInterval.offsetBegin = interval.offsetBegin; queueInterval.end = interval.end; + queueInterval.offsetEnd = interval.offsetEnd; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/positions/OrderedConjunctionPositionIterator.java b/lucene/core/src/java/org/apache/lucene/search/positions/OrderedConjunctionPositionIterator.java index 7360221..84bd2c3 100644 --- a/lucene/core/src/java/org/apache/lucene/search/positions/OrderedConjunctionPositionIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/positions/OrderedConjunctionPositionIterator.java @@ -26,11 +26,11 @@ public final class OrderedConjunctionPositionIterator extends private final PositionIntervalIterator[] iterators; private static final PositionInterval INFINITE_INTERVAL = new PositionInterval( - Integer.MIN_VALUE, Integer.MIN_VALUE); + Integer.MIN_VALUE, Integer.MIN_VALUE, -1, -1); private final PositionInterval[] intervals; private final int lastIter; private final PositionInterval interval = new PositionInterval( - Integer.MAX_VALUE, Integer.MAX_VALUE); + Integer.MAX_VALUE, Integer.MAX_VALUE, -1, -1); private int index = 1; public OrderedConjunctionPositionIterator(PositionIntervalIterator other) { @@ -52,6 +52,8 @@ public final class OrderedConjunctionPositionIterator extends 
interval.begin = Integer.MAX_VALUE; interval.end = Integer.MAX_VALUE; + interval.offsetBegin = -1; + interval.offsetEnd = -1; int b = Integer.MAX_VALUE; while (true) { while (true) { @@ -74,6 +76,8 @@ public final class OrderedConjunctionPositionIterator extends } interval.begin = intervals[0].begin; interval.end = intervals[lastIter].end; + interval.offsetBegin = intervals[0].offsetBegin; + interval.offsetEnd = intervals[lastIter].offsetEnd; b = intervals[lastIter].begin; index = 1; intervals[0] = iterators[0].next(); diff --git a/lucene/core/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java b/lucene/core/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java index d2a502f..8687696 100644 --- a/lucene/core/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java @@ -16,11 +16,11 @@ package org.apache.lucene.search.positions; * See the License for the specific language governing permissions and * limitations under the License. 
*/ -import java.io.IOException; - import org.apache.lucene.search.Scorer; import org.apache.lucene.util.BytesRef; +import java.io.IOException; + /** * * @lucene.experimental @@ -89,14 +89,18 @@ public abstract class PositionIntervalIterator { public int begin; public int end; + public int offsetBegin; + public int offsetEnd; - public PositionInterval(int begin, int end) { + public PositionInterval(int begin, int end, int offsetBegin, int offsetEnd) { this.begin = begin; this.end = end; + this.offsetBegin = offsetBegin; + this.offsetEnd = offsetEnd; } public PositionInterval() { - this(0, 0); + this(0, 0, -1, -1); } public boolean nextPayload(BytesRef ref) throws IOException { @@ -122,7 +126,7 @@ public abstract class PositionIntervalIterator { @Override public String toString() { - return "PositionInterval [begin=" + begin + ", end=" + end + "]"; + return "PositionInterval [begin=" + begin + "(" + offsetBegin + "), end=" + end + "(" + offsetEnd + ")]"; } } diff --git a/lucene/core/src/test/org/apache/lucene/search/positions/TestPositionOffsets.java b/lucene/core/src/test/org/apache/lucene/search/positions/TestPositionOffsets.java index e69de29..bdcdb0b 100644 --- a/lucene/core/src/test/org/apache/lucene/search/positions/TestPositionOffsets.java +++ b/lucene/core/src/test/org/apache/lucene/search/positions/TestPositionOffsets.java @@ -0,0 +1,196 @@ +package org.apache.lucene.search.positions; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; +import org.apache.lucene.codecs.memory.MemoryPostingsFormat; +import org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat; +import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import 
org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; +import org.junit.Ignore; + +import java.io.IOException; + +/** + * Copyright (c) 2012 Lemur Consulting Ltd. + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+public class TestPositionOffsets extends LuceneTestCase {
+
+  // Scope of these tests:
+  // - offsets can be read from a basic TermQuery, and from a more complex BooleanQuery
+  // - if offsets are not stored in the index, -1 is returned for both offsets
+
+  IndexWriterConfig iwc; // assigned in setUp(); passed to the RandomIndexWriter of every test
+  /** Assumes (via assumeTrue) that the default codec is SimpleText or Lucene40, then builds {@code iwc}. */
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+
+    // Currently only SimpleText and Lucene40 can index offsets into postings:
+    String codecName = Codec.getDefault().getName();
+    assumeTrue("Codec does not support offsets: " + codecName,
+        codecName.equals("SimpleText") ||
+        codecName.equals("Lucene40"));
+
+    iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+
+    if (codecName.equals("Lucene40")) {
+      // Sep etc. are not implemented, so pin one of these four postings formats at random
+      switch(random().nextInt(4)) {
+        case 0: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat())); break;
+        case 1: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat())); break;
+        case 2: iwc.setCodec(_TestUtil.alwaysPostingsFormat(
+            new Pulsing40PostingsFormat(_TestUtil.nextInt(random(), 1, 3)))); break;
+        case 3: iwc.setCodec(_TestUtil.alwaysPostingsFormat(new NestedPulsingPostingsFormat())); break;
+      }
+    }
+  }
+  /** Adds the one test document to {@code writer}; offsets are indexed only if {@code withOffsets} is true. */
+  private static void addDocs(RandomIndexWriter writer, boolean withOffsets) throws IOException {
+    FieldType fieldType = TextField.TYPE_STORED;
+    if (withOffsets) {
+      fieldType = new FieldType(fieldType); // new instance, so the index options are set on it and not on TYPE_STORED
+      fieldType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    }
+    Document doc = new Document();
+    doc.add(newField(
+        "field",
+        "Pease porridge hot! Pease porridge cold! Pease porridge in the pot nine days old! Some like it hot, some" +
+        " like it cold, Some like it in the pot nine days old! Pease porridge hot! Pease porridge cold!",
+        fieldType));
+    writer.addDocument(doc);
+  }
+  /** With offsets indexed, each "porridge" match must report the character offsets of that occurrence. */
+  public void testTermQueryWithOffsets() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc);
+    addDocs(writer, true);
+
+    IndexReader reader = writer.getReader();
+    IndexSearcher searcher = new IndexSearcher(reader);
+    writer.close();
+    Query query = new TermQuery(new Term("field", "porridge"));
+    // Build the scorer by hand on the only leaf so its position iterator can be inspected directly.
+    Weight weight = query.createWeight(searcher);
+    IndexReaderContext topReaderContext = searcher.getTopReaderContext();
+    AtomicReaderContext[] leaves = topReaderContext.leaves();
+    assertEquals(1, leaves.length);
+    Scorer scorer = weight.scorer(leaves[0],
+        true, true, leaves[0].reader().getLiveDocs());
+
+    int nextDoc = scorer.nextDoc();
+    assertEquals(0, nextDoc);
+    PositionIntervalIterator positions = scorer.positions(false, true);
+    int[] startOffsets = new int[] { 6, 26, 47, 164, 184 }; // index of each "porridge" in the field text
+    int[] endOffsets = new int[] { 14, 34, 55, 172, 192 };   // start + "porridge".length()
+
+    assertEquals(0, positions.advanceTo(nextDoc));
+    for (int i = 0; i < startOffsets.length; i++) {
+      PositionIntervalIterator.PositionInterval interval = positions.next();
+      assertEquals(startOffsets[i], interval.offsetBegin);
+      assertEquals(endOffsets[i], interval.offsetEnd);
+    }
+    // After the five expected matches the iterator must be exhausted.
+    assertNull(positions.next());
+
+    reader.close();
+    directory.close();
+  }
+  /** Without offsets indexed, each "porridge" match must report -1 for both offsets. */
+  public void testTermQueryWithoutOffsets() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc);
+    addDocs(writer, false);
+
+    IndexReader reader = writer.getReader();
+    IndexSearcher searcher = new IndexSearcher(reader);
+    writer.close();
+    Query query = new TermQuery(new Term("field", "porridge"));
+    // Build the scorer by hand on the only leaf so its position iterator can be inspected directly.
+    Weight weight = query.createWeight(searcher);
+    IndexReaderContext topReaderContext = searcher.getTopReaderContext();
+    AtomicReaderContext[] leaves = topReaderContext.leaves();
+    assertEquals(1, leaves.length);
+    Scorer scorer = weight.scorer(leaves[0],
+        true, true, leaves[0].reader().getLiveDocs());
+
+    int nextDoc = scorer.nextDoc();
+    assertEquals(0, nextDoc);
+    PositionIntervalIterator positions = scorer.positions(false, false);
+    int[] startOffsets = new int[] { -1, -1, -1, -1, -1 }; // still five matches, but no offset data
+    int[] endOffsets = new int[] { -1, -1, -1, -1, -1 };
+
+    assertEquals(0, positions.advanceTo(nextDoc));
+    for (int i = 0; i < startOffsets.length; i++) {
+      PositionIntervalIterator.PositionInterval interval = positions.next();
+      assertEquals(startOffsets[i], interval.offsetBegin);
+      assertEquals(endOffsets[i], interval.offsetEnd);
+    }
+    // After the five expected matches the iterator must be exhausted.
+    assertNull(positions.next());
+
+    reader.close();
+    directory.close();
+  }
+
+  // Disabled: a ConjunctionPositionIterator currently returns PositionIntervals that span over all the
+  // individual clauses, whereas this test expects each clause's match position as a separate interval
+  // ("porridge" and "nine" matches merged in offset order). I don't think the current behaviour is right...?
+  @Ignore
+  public void testBooleanQueryWithOffsets() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc);
+    addDocs(writer, true);
+
+    IndexReader reader = writer.getReader();
+    IndexSearcher searcher = new IndexSearcher(reader);
+    writer.close();
+    BooleanQuery query = new BooleanQuery();
+    query.add(new BooleanClause(new TermQuery(new Term("field", "porridge")), BooleanClause.Occur.MUST));
+    query.add(new BooleanClause(new TermQuery(new Term("field", "nine")), BooleanClause.Occur.MUST));
+    // Build the scorer by hand on the only leaf so its position iterator can be inspected directly.
+    Weight weight = query.createWeight(searcher);
+    IndexReaderContext topReaderContext = searcher.getTopReaderContext();
+    AtomicReaderContext[] leaves = topReaderContext.leaves();
+    assertEquals(1, leaves.length);
+    Scorer scorer = weight.scorer(leaves[0],
+        true, true, leaves[0].reader().getLiveDocs());
+
+    int nextDoc = scorer.nextDoc();
+    assertEquals(0, nextDoc);
+    PositionIntervalIterator positions = scorer.positions(false, true);
+    int[] startOffsets = new int[] { 6, 26, 47, 67, 143, 164, 184 }; // "nine" starts at 67 and 143 in the field text
+    int[] endOffsets = new int[] { 14, 34, 55, 71, 147, 172, 192 };   // "nine" ends at 71 and 147
+
+    assertEquals(0, positions.advanceTo(nextDoc));
+    for (int i = 0; i < startOffsets.length; i++) {
+      PositionIntervalIterator.PositionInterval interval = positions.next();
+      assertEquals(startOffsets[i], interval.offsetBegin);
+      assertEquals(endOffsets[i], interval.offsetEnd);
+    }
+    // After the seven expected matches the iterator must be exhausted.
+    assertNull(positions.next());
+
+    reader.close();
+    directory.close();
+  }
+}