### Eclipse Workspace Patch 1.0
#P lucene
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosScorer.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosScorer.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosScorer.java (revision 0)
@@ -0,0 +1,80 @@
+package org.apache.lucene.search.poshighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.search.highlight.TextFragment;
+
+/**
+ * Minimal fragment scorer intended for testing. Each token's score is read from
+ * the {@link ScoreAttribute} of the token stream passed to {@link #init(TokenStream)};
+ * a fragment's score is the running sum of the positive token scores reported
+ * since the fragment started.
+ *
+ * A more elaborate scorer might give a higher score to fragments containing
+ * more distinct tokens.
+ *
+ * @lucene.experimental
+ */
+public class PosScorer implements org.apache.lucene.search.highlight.Scorer {
+  /** Score attribute of the stream being scored; assigned in {@link #init(TokenStream)}. */
+  private ScoreAttribute scoreAtt;
+  /** Sum of the positive token scores seen since the last {@link #startFragment(TextFragment)}. */
+  private float fragmentScore;
+
+  /**
+   * Registers a {@link ScoreAttribute} on the stream and keeps it for scoring.
+   *
+   * @param tokenStream the stream whose tokens will be scored
+   * @return the same stream instance that was passed in
+   */
+  @Override
+  public TokenStream init(TokenStream tokenStream) throws IOException {
+    scoreAtt = tokenStream.addAttribute(ScoreAttribute.class);
+    return tokenStream;
+  }
+
+  /** Resets the accumulated fragment score; the fragment itself is not inspected. */
+  @Override
+  public void startFragment(TextFragment newFragment) {
+    fragmentScore = 0;
+  }
+
+  /**
+   * Reports the current token's score and adds it to the fragment total.
+   *
+   * @return the attribute's score when it is greater than zero, otherwise 0
+   */
+  @Override
+  public float getTokenScore() {
+    // Negated "> 0" test: anything that is not strictly positive scores zero.
+    if (!(scoreAtt.score() > 0)) {
+      return 0;
+    }
+    fragmentScore += scoreAtt.score();
+    return scoreAtt.score();
+  }
+
+  /** @return the total accumulated by {@link #getTokenScore()} for the current fragment */
+  @Override
+  public float getFragmentScore() {
+    return fragmentScore;
+  }
+}
Index: lucene/src/java/org/apache/lucene/search/positions/OrderedConjunctionPositionIterator.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/positions/OrderedConjunctionPositionIterator.java (revision 1150180)
+++ lucene/src/java/org/apache/lucene/search/positions/OrderedConjunctionPositionIterator.java (working copy)
@@ -90,7 +90,7 @@
}
@Override
- public void collect() {
+ public void collect() throws IOException {
collector.collectComposite(scorer, interval, currentDoc);
for (PositionIntervalIterator iter : iterators) {
iter.collect();
Index: lucene/src/java/org/apache/lucene/search/positions/ConjunctionPositionIterator.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/positions/ConjunctionPositionIterator.java (revision 1150180)
+++ lucene/src/java/org/apache/lucene/search/positions/ConjunctionPositionIterator.java (working copy)
@@ -83,7 +83,7 @@
}
@Override
- public void collect() {
+ public void collect() throws IOException {
collector.collectComposite(scorer, queue.queueInterval, currentDoc);
for (PositionIntervalIterator iter : iterators) {
iter.collect();
Index: lucene/src/java/org/apache/lucene/search/positions/RangePositionsIterator.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/positions/RangePositionsIterator.java (revision 1150180)
+++ lucene/src/java/org/apache/lucene/search/positions/RangePositionsIterator.java (working copy)
@@ -64,7 +64,7 @@
}
@Override
- public void collect() {
+ public void collect() throws IOException {
collector.collectComposite(null, interval, iterator.docID());
iterator.collect();
}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 1150180)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy)
@@ -221,8 +221,10 @@
textFragmenter.start(text, tokenStream);
TokenGroup tokenGroup=new TokenGroup(tokenStream);
-
- for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
+ boolean next = tokenStream.incrementToken();
+ // start the first fragment at the beginning of the first token
+ lastEndOffset = offsetAtt.startOffset();
+ for (; next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
next = tokenStream.incrementToken())
{
if( (offsetAtt.endOffset()>text.length())
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SingleDocFilter.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SingleDocFilter.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SingleDocFilter.java (revision 0)
@@ -0,0 +1,88 @@
+package org.apache.lucene.search.poshighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+
+/**
+ * Filter that matches a single document.
+ *
+ * @lucene.experimental
+ */
+public class SingleDocFilter extends Filter {
+  /** The only document id this filter reports; never negative. */
+  private final int doc;
+
+  /**
+   * @param doc id of the document to match
+   * @throws IllegalArgumentException if {@code doc} is negative: the iterator treats a
+   *         negative current id as "not started", so it could never step past a
+   *         negative doc and {@code advance} would loop forever
+   */
+  public SingleDocFilter (int doc) {
+    if (doc < 0) {
+      throw new IllegalArgumentException("doc must be >= 0, got " + doc);
+    }
+    this.doc = doc;
+  }
+
+  /**
+   * Returns a set whose iterator yields {@code doc} once and is then exhausted.
+   *
+   * NOTE(review): {@code context} is ignored, so the same id is reported for every
+   * reader context this filter is asked about. If {@code doc} is a top-level id and
+   * the index has several segments this presumably needs rebasing -- confirm
+   * against the caller.
+   */
+  @Override
+  public DocIdSet getDocIdSet(AtomicReaderContext context) {
+    return new DocIdSet () {
+      @Override
+      public DocIdSetIterator iterator() {
+        return new DocIdSetIterator () {
+          // -1 until the first nextDoc()/advance() call, then doc, then NO_MORE_DOCS
+          private int curr = -1;
+
+          @Override
+          public int docID() {
+            return curr;
+          }
+
+          @Override
+          public int nextDoc() throws IOException {
+            curr = (curr < 0) ? doc : NO_MORE_DOCS;
+            return curr;
+          }
+
+          @Override
+          public int advance(int target) throws IOException {
+            // Only an unstarted iterator can still land on doc; otherwise it is exhausted.
+            curr = (curr < 0 && doc >= target) ? doc : NO_MORE_DOCS;
+            return curr;
+          }
+        };
+      }
+    };
+  }
+
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffset.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffset.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffset.java (revision 0)
@@ -0,0 +1,65 @@
+package org.apache.lucene.search.poshighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Pairs a position interval with the offsets that bound it.
+ * The four values are package-visible fields and are also exposed through getters.
+ * @lucene.experimental
+ */
+class PosOffset {
+
+  /** Start position of the interval, as given to the constructor. */
+  int startPosition;
+  /** End position of the interval, as given to the constructor. */
+  int endPosition;
+  /** Offset stored as the start of the interval. */
+  int startOffset;
+  /** Offset stored as the end of the interval. */
+  int endOffset;
+
+  /** Stores the four values unchanged; no validation is performed. */
+  public PosOffset(int startPosition, int endPosition, int startOffset,
+      int endOffset) {
+    this.startPosition = startPosition;
+    this.endPosition = endPosition;
+    this.startOffset = startOffset;
+    this.endOffset = endOffset;
+  }
+
+  /** @return the stored start position */
+  public int getStartPosition() {
+    return startPosition;
+  }
+
+  /** @return the stored end position */
+  public int getEndPosition() {
+    return endPosition;
+  }
+
+  /** @return the stored start offset */
+  public int getStartOffset() {
+    return startOffset;
+  }
+
+  /** @return the stored end offset */
+  public int getEndOffset() {
+    return endOffset;
+  }
+
+}
\ No newline at end of file
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionIntervalArrayIterator.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionIntervalArrayIterator.java (revision 1150180)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionIntervalArrayIterator.java (working copy)
@@ -1,61 +0,0 @@
-package org.apache.lucene.search.poshighlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.search.positions.PositionIntervalIterator;
-
-/**
- * Present an array of PositionIntervals as an Iterator.
- * @lucene.experimental
- */
-public class PositionIntervalArrayIterator extends PositionIntervalIterator {
-
- private int next = 0;
- private int count;
- private PositionInterval[] positions;
-
- public PositionIntervalArrayIterator (PositionInterval[] positions, int count) {
- super(null);
- this.positions = positions;
- this.count = count;
- }
-
- @Override
- public PositionInterval next() {
- if (next >= count)
- return null;
- return positions[next++];
- }
-
- @Override
- public PositionIntervalIterator[] subs(boolean inOrder) {
- return EMPTY;
- }
-
- @Override
- public void collect() {
- }
-
- @Override
- public int advanceTo(int docId) throws IOException {
- return 0;
- }
-
-}
\ No newline at end of file
Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java
===================================================================
--- lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java (revision 1150180)
+++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/PosHighlighterTest.java (working copy)
@@ -1,11 +1,28 @@
package org.apache.lucene.search.poshighlight;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
import java.io.IOException;
+import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
@@ -20,25 +37,20 @@
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
-import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
-import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.positions.PositionFilterQuery;
import org.apache.lucene.search.positions.TestBlockPositionsIterator.BlockPositionIteratorFilter;
import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.Version;
/**
- * TODO:
- * Phrase and Span Queries
- * positions callback API
+ * @lucene.experimental
*/
public class PosHighlighterTest extends LuceneTestCase {
@@ -46,6 +58,7 @@
protected Analyzer analyzer;
protected Directory dir;
protected IndexSearcher searcher;
+ protected boolean useOffsetPayloads;
private static final String PORRIDGE_VERSE =
"Pease porridge hot! Pease porridge cold! Pease porridge in the pot nine days old! Some like it hot, some"
@@ -54,8 +67,9 @@
@Override
public void setUp() throws Exception {
super.setUp();
- analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
+ analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
dir = newDirectory();
+ useOffsetPayloads = true;
}
@Override
@@ -82,6 +96,13 @@
for( String value: values ) {
Document doc = new Document();
Field f = new Field (F, value, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS);
+ TokenStream tokens = analyzer.tokenStream(F, new StringReader (value));
+ if (useOffsetPayloads) {
+ OffsetPayloadWriter tokensWithOffsets = new OffsetPayloadWriter(tokens);
+ f.setTokenStream(tokensWithOffsets);
+ } else {
+ f.setTokenStream(tokens);
+ }
doc.add (f);
writer.addDocument( doc );
}
@@ -94,51 +115,21 @@
return doSearch(q, 100);
}
- private class ConstantScorer implements org.apache.lucene.search.highlight.Scorer {
-
- @Override
- public TokenStream init(TokenStream tokenStream) throws IOException {
- return tokenStream;
- }
-
- @Override
- public void startFragment(TextFragment newFragment) {
- }
-
- @Override
- public float getTokenScore() {
- return 1;
- }
-
- @Override
- public float getFragmentScore() {
- return 1;
- }
- }
-
private String[] doSearch(Query q, int maxFragSize) throws IOException, InvalidTokenOffsetsException {
return doSearch (q, maxFragSize, 0);
}
- private String[] doSearch(Query q, int maxFragSize, int docIndex) throws IOException, InvalidTokenOffsetsException {
- // ConstantScorer is a fragment Scorer, not a search result (document) Scorer
- Highlighter highlighter = new Highlighter (new ConstantScorer());
+
+ private String[] doSearch(Query q, int maxFragSize, int docid) throws IOException, InvalidTokenOffsetsException {
+ // PosScorer is a fragment Scorer, not a search result (document) Scorer
+ PosHighlighter highlighter = new PosHighlighter ();
highlighter.setTextFragmenter(new SimpleFragmenter(maxFragSize));
- PosCollector collector = new PosCollector(10);
- if (q instanceof MultiTermQuery) {
- ((MultiTermQuery)q).setRewriteMethod (MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
- }
- searcher.search(q, collector);
- ScorePosDoc doc = collector.docs[docIndex];
- if (doc == null)
+ //highlighter.setTextFragmenter(new SimpleFragmenter(maxFragSize));
+
+ String text = searcher.getIndexReader().document(docid).getFieldable(F).stringValue();
+ TokenStream tstream = PosHighlighter.getPosTokenStream (q, docid, useOffsetPayloads ? null : F, searcher, text);
+ if (tstream == null)
return null;
- String text = searcher.getIndexReader().document(doc.doc).getFieldable(F).stringValue();
- PositionOffsetMapper pom = new PositionOffsetMapper ();
- // FIXME: test error cases: for non-stored fields, and fields w/no term vectors
- searcher.getIndexReader().getTermFreqVector(doc.doc, F, pom);
-
- TextFragment[] fragTexts = highlighter.getBestTextFragments(new PosTokenStream
- (text, new PositionIntervalArrayIterator(doc.sortedPositions(), doc.posCount), pom),
- text, false, 10);
+ TextFragment[] fragTexts = highlighter.getBestTextFragments (tstream, text, false, 10);
String[] frags = new String[fragTexts.length];
for (int i = 0; i < frags.length; i++)
frags[i] = fragTexts[i].toString();
@@ -152,9 +143,9 @@
}
public void testSeveralSnippets () throws Exception {
- String input = "this is some long text. It has the word long in many places. In fact, it has long on some different fragments. " +
+ String input = "this is some long text. It has the word long in many places. " +
"Let us see what happens to long in this case.";
- String gold = "this is some long text. It has the word long in many places. In fact, it has long on some different fragments. " +
+ String gold = "this is some long text. It has the word long in many places. " +
"Let us see what happens to long in this case.";
insertDocs(analyzer, input);
String frags[] = doSearch (new TermQuery(new Term(F, "long")), input.length());
@@ -231,10 +222,7 @@
// make sure we highlight the phrase, and not the terms outside the phrase
assertEquals ("is it that this is a test, is it", frags[0]);
}
-
- /*
- * Failing ... PhraseQuery scorer needs positions()?
- */
+
public void testPhraseOriginal() throws Exception {
insertDocs(analyzer, "This is a test");
PhraseQuery pq = new PhraseQuery();
@@ -270,9 +258,9 @@
BooleanQuery bq = new BooleanQuery();
bq.add(new BooleanClause (new TermQuery(new Term(F, "Pease")), Occur.MUST));
bq.add(new BooleanClause (new TermQuery(new Term(F, "porridge")), Occur.MUST));
- String frags[] = doSearch (bq, 50, 0);
+ String frags[] = doSearch (bq, 50, 1);
assertEquals ("Pease porridge hot! Pease porridge cold! Pease", frags[0]);
- frags = doSearch (bq, 50, 1);
+ frags = doSearch (bq, 50, 2);
assertEquals ("This document has some Pease porridge in it", frags[0]);
}
@@ -288,10 +276,48 @@
BooleanQuery bq = new BooleanQuery();
bq.add(new BooleanClause (new TermQuery(new Term(F, "Pease")), Occur.SHOULD));
bq.add(new BooleanClause (new TermQuery(new Term(F, "porridge")), Occur.SHOULD));
- String frags[] = doSearch (bq, 50, 0);
+ String frags[] = doSearch (bq, 50, 1);
assertEquals ("Pease porridge hot! Pease porridge cold! Pease", frags[0]);
- frags = doSearch (bq, 50, 1);
+ frags = doSearch (bq, 50, 2);
assertEquals ("This document has some Pease porridge in it", frags[0]);
}
+
+  public void testLongishDocument() throws Exception {
+    // 1000 filler words, then the sentence that holds both query terms
+    StringBuilder text = new StringBuilder ();
+    for (int n = 0; n < 1000; n++) {
+      text.append ("dummy ");
+    }
+    text.append ("This document has some Pease porridge in it");
+    insertDocs(analyzer, text.toString());
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "Pease")), Occur.SHOULD));
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "porridge")), Occur.SHOULD));
+    assertEquals ("y This document has some Pease porridge in it", doSearch (bq, 50, 0)[0]);
+    insertDocs(analyzer, text.toString());
+    // append 1000 more filler words after the sentence and index again
+    for (int n = 0; n < 1000; n++) {
+      text.append (" dummy");
+    }
+    insertDocs(analyzer, text.toString());
+    assertEquals ("y This document has some Pease porridge", doSearch (bq, 50, 0)[0]);
+    assertEquals ("y This document has some Pease porridge in it dummy dummy dummy ", doSearch (bq, 75, 0)[0]);
+  }
+
+  public void testFragmentation() throws Exception {
+    // index the numbers 0..999, each followed by a single space
+    StringBuilder text = new StringBuilder ();
+    for (int n = 0; n < 1000; n++) {
+      text.append (Integer.toString(n)).append(' ');
+    }
+    insertDocs(analyzer, text.toString());
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "10")), Occur.SHOULD));
+    bq.add(new BooleanClause (new TermQuery(new Term(F, "500")), Occur.SHOULD));
+    String[] frags = doSearch (bq, 75, 0);
+    assertEquals ("0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ", frags[0]);
+    assertEquals (" 494 495 496 497 498 499 500 501 502 503 504 505 506 ", frags[1]);
+  }
+
}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java (revision 1149428)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java (working copy)
@@ -1,75 +0,0 @@
-package org.apache.lucene.search.poshighlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.search.positions.PositionIntervalIterator;
-import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval;
-
-/**
- * A TokenStream constructed from a stream of positions and their offsets.
- * The document is segmented into tokens at the start and end offset of each interval. The intervals
- * are assumed to be non-overlapping.
- *
- * TODO: abstract the dependency on the current PositionOffsetMapper impl;
- * allow for implementations of position->offset maps that don't rely on term vectors.
- *
- * @lucene.experimental
- */
-public class PosTokenStream extends TokenStream {
-
- //this tokenizer generates four attributes:
- // term, offset, positionIncrement? and type?
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- //private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private final String text;
- private final PositionIntervalIterator positions;
-
- // the index of the current position interval
- private PositionInterval pos = null;
- private final PositionOffsetMapper pom;
-
- public PosTokenStream (String text, PositionIntervalIterator positions, PositionOffsetMapper pom) {
- this.text = text;
- this.positions = positions;
- this.pom = pom;
- }
-
- @Override
- public final boolean incrementToken() throws IOException {
- pos = positions.next();
- if (pos == null){
- return false;
- }
- int b, e;
- b = pom.getStartOffset(pos.begin);
- e = pom.getEndOffset(pos.end);
- termAtt.append(text, b, e);
- offsetAtt.setOffset(b, e);
- posIncrAtt.setPositionIncrement(1);
- return true;
- }
-
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosTokenStream.java (revision 1149428)
@@ -0,0 +1,183 @@
+package org.apache.lucene.search.poshighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+/**
+ * A TokenStream constructed from a stream of position intervals and their offsets.
+ * The document is segmented into tokens at the start and end offset of each interval. The intervals
+ * are assumed to be non-overlapping. Additional non-scoring tokens are generated before
+ * the first and after the final interval to enable a fragmenting highlighter to include surrounding text
+ * without the need to analyze the entire document.
+ * The slop parameter is the maximum width, in characters, of each of these surrounding tokens.
+ *
+ * @lucene.experimental
+ */
+public class PosTokenStream extends TokenStream {
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final ScoreAttribute scoreAtt = addAttribute(ScoreAttribute.class);
+  /** Full document text; every token's term is a substring of it. */
+  private final String text;
+  /** Match intervals still to be emitted, consumed by {@link #nextPos()}. */
+  private final Iterator<PosOffset> poIter;
+  /** Maximum width, in characters, of the zero-score context emitted around matches. */
+  private final int slop;
+  /** Step that the next {@link #incrementToken()} call will execute. */
+  private State state;
+  /** Interval last handed out by the iterator; null until the first {@link #nextPos()} call. */
+  private PosOffset po = null;
+  /** End offset recorded by the most recent preamble, match, gap or postscript token. */
+  private int lastEndOffset=0;
+
+  /** Creates a stream using a slop of 25. */
+  public PosTokenStream (String text, PosOffsetMap positions) {
+    this (text, positions, 25);
+  }
+
+  /**
+   * @param text the document text that the offsets in {@code positions} refer to
+   * @param positions supplies the match intervals through its {@code iterator()}
+   * @param slop maximum width in characters of the context tokens; must be positive
+   * @throws IllegalArgumentException if {@code slop <= 0}: a slop of 0 makes the state machine
+   *         emit empty tokens forever, and a negative slop yields inverted offsets
+   */
+  public PosTokenStream (String text, PosOffsetMap positions, int slop) {
+    if (slop <= 0) {
+      throw new IllegalArgumentException("slop must be > 0, got " + slop);
+    }
+    this.slop = slop;
+    this.text = text;
+    this.poIter = positions.iterator();
+    this.state = State.Head;
+  }
+
+  /**
+   * Steps of the token generator:
+   *
+   * - Head: fetches the first interval (no tokens at all if there is none), then acts as Garbage
+   * - Garbage: zero-score text from the last recorded end up to slop chars before the next match
+   * - Preamble: zero-score token of up to slop chars immediately before a match
+   * - Matches: score-1 token for an interval, or zero-score context after it when the next one is far
+   * - Postscript: zero-score token of up to slop chars after the last match
+   * - Tail: zero-score token for the remaining text up to the end of the document
+   * - Done: no more tokens
+   */
+  enum State {
+    Head, Garbage, Preamble, Matches, Postscript, Tail, Done
+  }
+
+  // TODO: break slop at whitespace at (frag size - match size) / 2
+  /**
+   * Emits the next token by running one step of the {@link State} machine. The term is the
+   * substring of the text between the chosen offsets; matches score 1, everything else 0.
+   *
+   * @return false when there are no intervals at all or the end of the text has been reached
+   */
+  @Override
+  public final boolean incrementToken() throws IOException {
+
+    int b, e;
+    switch(state) {
+    case Head:
+      if (nextPos() == null) {
+        return false;
+      }
+      // fall through: text before the first match is handled like a gap
+    case Garbage:
+      state = State.Preamble;
+      b = lastEndOffset;
+      e = po.startOffset - slop;
+      if (e > 0) {
+        scoreAtt.setScore(0);
+        break;
+      } // else fall through ...
+    case Preamble:
+      state = State.Matches;
+      if (po.startOffset > 0) {
+        b = Math.max(0, po.startOffset - slop);
+        lastEndOffset = e = po.startOffset;
+        scoreAtt.setScore(0);
+        break;
+      } // else fall through ...
+    case Matches:
+      if (po.startOffset < lastEndOffset + slop) {
+        // the interval starts within slop of the last recorded end: emit the match itself
+        b = po.startOffset;
+        lastEndOffset = e = po.endOffset;
+        scoreAtt.setScore(1);
+        if (nextPos()== null) {
+          state = State.Postscript;
+        }
+      } else {
+        // next interval is at least slop away: emit trailing context, stopping where its preamble begins
+        b = lastEndOffset;
+        lastEndOffset = e = Math.min(b + slop, po.startOffset - slop);
+        scoreAtt.setScore(0);
+        state = State.Garbage;
+      }
+      break;
+    case Postscript:
+      state = State.Tail;
+      if (lastEndOffset >= text.length())
+        return false;
+      b = lastEndOffset;
+      e = Math.min (lastEndOffset + slop, text.length());
+      lastEndOffset = e;
+      scoreAtt.setScore(0);
+      break;
+    case Tail:
+      state = State.Done;
+      if (lastEndOffset >= text.length())
+        return false;
+      b = lastEndOffset;
+      e = text.length();
+      scoreAtt.setScore(0);
+      break;
+    case Done:
+    default: // compiler obeisance
+      return false;
+    }
+    termAtt.setEmpty();
+    termAtt.append(text, b, e);
+    offsetAtt.setOffset(b, e);
+    posIncrAtt.setPositionIncrement(1);
+    return true;
+  }
+
+  /**
+   * Moves to the next interval if the iterator has one.
+   *
+   * @return the new current interval, or null (leaving the current one unchanged) when exhausted
+   */
+  private PosOffset nextPos () throws IOException {
+    if (!poIter.hasNext())
+      return null;
+    po = poIter.next();
+    return po;
+  }
+}
Index: lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java (revision 1149428)
+++ lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java (working copy)
@@ -1,135 +0,0 @@
-package org.apache.lucene.search.positions;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-import java.io.IOException;
-
-import org.apache.lucene.search.Scorer;
-import org.apache.lucene.util.BytesRef;
-
-/**
- *
- * @lucene.experimental
- */ // nocommit - javadoc
-public abstract class PositionIntervalIterator {
-
- public static final PositionIntervalIterator[] EMPTY = new PositionIntervalIterator[0];
- public static final int NO_MORE_DOCS = Integer.MAX_VALUE;
- public static final PositionCollector EMPTY_COLLECTOR = new PositionCollector() {
-
- @Override
- public void collectLeafPosition(Scorer scorer, PositionInterval interval,
- int docID) {
- }
-
- @Override
- public void collectComposite(Scorer scorer, PositionInterval interval,
- int docID) {
- }
-
- };
-
- protected int currentDoc = -1;
- protected final Scorer scorer;
- protected PositionCollector collector = EMPTY_COLLECTOR;
-
- public PositionIntervalIterator(Scorer scorer) {
- this.scorer = scorer;
- }
-
- public abstract int advanceTo(int docId) throws IOException;
-
- public abstract PositionInterval next() throws IOException;
-
- public void setPositionCollector(PositionCollector collector) {
- if (collector == null) {
- throw new IllegalArgumentException("PositionCollector must not be null");
- }
- this.collector = collector;
- PositionIntervalIterator[] subs = subs(false);
- for (PositionIntervalIterator positionIntervalIterator : subs) {
- positionIntervalIterator.setPositionCollector(collector);
- }
- }
-
-
- public abstract void collect();
-
- public abstract PositionIntervalIterator[] subs(boolean inOrder);
-
- public int docID() {
- return currentDoc;
- }
-
- public Scorer getScorer() {
- return scorer;
- }
-
- public static interface PositionIntervalFilter {
- public abstract PositionIntervalIterator filter(
- PositionIntervalIterator iter);
- }
-
- public static class PositionInterval implements Cloneable {
-
- public int begin;
- public int end;
-
- public PositionInterval(int begin, int end) {
- this.begin = begin;
- this.end = end;
- }
-
- public PositionInterval() {
- this(0, 0);
- }
-
- public boolean nextPayload(BytesRef ref) throws IOException {
- return false;
- }
-
- public boolean payloadAvailable() {
- return false;
- }
-
- public void reset() {
- begin = end = -1;
- }
-
- @Override
- public Object clone() {
- try {
- return super.clone();
- } catch (CloneNotSupportedException e) {
- throw new RuntimeException(); // should not happen
- }
- }
-
- @Override
- public String toString() {
- return "PositionInterval [begin=" + begin + ", end=" + end + "]";
- }
-
- }
-
- public static interface PositionCollector {
- public void collectLeafPosition(Scorer scorer, PositionInterval interval, int docID);
- public void collectComposite(Scorer scorer, PositionInterval interval, int docID);
-
- }
-
-}
Index: lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java (revision 0)
+++ lucene/src/java/org/apache/lucene/search/positions/PositionIntervalIterator.java (revision 1149428)
@@ -0,0 +1,135 @@
+package org.apache.lucene.search.positions;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ *
+ * @lucene.experimental
+ */ // nocommit - javadoc
+public abstract class PositionIntervalIterator {
+ /** Shared empty array; iterators without sub-iterators can return it from {@link #subs(boolean)}. */
+ public static final PositionIntervalIterator[] EMPTY = new PositionIntervalIterator[0];
+ public static final int NO_MORE_DOCS = Integer.MAX_VALUE; // NOTE(review): presumably the end-of-iteration doc id -- confirm
+ public static final PositionCollector EMPTY_COLLECTOR = new PositionCollector() {
+ // No-op collector installed by default: both callbacks ignore whatever is reported.
+ @Override
+ public void collectLeafPosition(Scorer scorer, PositionInterval interval,
+ int docID) {
+ }
+
+ @Override
+ public void collectComposite(Scorer scorer, PositionInterval interval,
+ int docID) {
+ }
+
+ };
+
+ protected int currentDoc = -1; // returned by docID(); starts at -1
+ protected final Scorer scorer;
+ protected PositionCollector collector = EMPTY_COLLECTOR;
+ /** Creates an iterator for {@code scorer}, which {@link #getScorer()} returns unchanged. */
+ public PositionIntervalIterator(Scorer scorer) {
+ this.scorer = scorer;
+ }
+ /** NOTE(review): contract not visible here; presumably moves to {@code docId} and returns the resulting doc id -- confirm. */
+ public abstract int advanceTo(int docId) throws IOException;
+ /** NOTE(review): presumably returns the next interval, or {@code null} once exhausted -- confirm. */
+ public abstract PositionInterval next() throws IOException;
+ /** Installs {@code collector} here and on every iterator from {@code subs(false)}; throws IllegalArgumentException on {@code null}. */
+ public void setPositionCollector(PositionCollector collector) {
+ if (collector == null) {
+ throw new IllegalArgumentException("PositionCollector must not be null");
+ }
+ this.collector = collector;
+ PositionIntervalIterator[] subs = subs(false);
+ for (PositionIntervalIterator positionIntervalIterator : subs) {
+ positionIntervalIterator.setPositionCollector(collector);
+ }
+ }
+
+ /** Reports the current interval(s) to the installed {@link PositionCollector}; may fail with an IOException. */
+ public abstract void collect() throws IOException;
+ /** Returns this iterator's sub-iterators. NOTE(review): the effect of {@code inOrder} is not visible here -- confirm. */
+ public abstract PositionIntervalIterator[] subs(boolean inOrder);
+ /** Returns {@link #currentDoc}. */
+ public int docID() {
+ return currentDoc;
+ }
+ /** Returns the scorer given to the constructor. */
+ public Scorer getScorer() {
+ return scorer;
+ }
+ /** Maps one iterator to another {@link PositionIntervalIterator}. */
+ public static interface PositionIntervalFilter {
+ public abstract PositionIntervalIterator filter(
+ PositionIntervalIterator iter);
+ }
+ /** Mutable interval with public {@code begin} and {@code end} fields. */
+ public static class PositionInterval implements Cloneable {
+
+ public int begin;
+ public int end;
+
+ public PositionInterval(int begin, int end) {
+ this.begin = begin;
+ this.end = end;
+ }
+ /** Creates an interval whose {@code begin} and {@code end} are both 0. */
+ public PositionInterval() {
+ this(0, 0);
+ }
+ /** Base implementation carries no payload: returns {@code false} without touching {@code ref}. */
+ public boolean nextPayload(BytesRef ref) throws IOException {
+ return false;
+ }
+ /** Base implementation: always {@code false}. */
+ public boolean payloadAvailable() {
+ return false;
+ }
+ /** Sets both {@code begin} and {@code end} to -1. */
+ public void reset() {
+ begin = end = -1;
+ }
+ /** Returns a field-by-field copy made by {@link Object#clone()}. */
+ @Override
+ public Object clone() {
+ try {
+ return super.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new RuntimeException(e); // should not happen (class is Cloneable); keep the cause for diagnosis
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "PositionInterval [begin=" + begin + ", end=" + end + "]";
+ }
+
+ }
+ /** Callbacks through which intervals are reported; both may throw an IOException. */
+ public static interface PositionCollector {
+ public void collectLeafPosition(Scorer scorer, PositionInterval interval, int docID) throws IOException;
+ public void collectComposite(Scorer scorer, PositionInterval interval, int docID) throws IOException;
+
+ }
+
+}
Index: lucene/src/java/org/apache/lucene/search/positions/PositionFilterQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/positions/PositionFilterQuery.java (revision 1150180)
+++ lucene/src/java/org/apache/lucene/search/positions/PositionFilterQuery.java (working copy)
@@ -195,7 +195,7 @@
}
@Override
- public void collect() {
+ public void collect() throws IOException {
other.collect();
}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SimpleFragmenter.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SimpleFragmenter.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/SimpleFragmenter.java (revision 0)
@@ -0,0 +1,62 @@
+package org.apache.lucene.search.poshighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.search.highlight.Fragmenter;
+
+/**
+ * Fragments text in fixed-size chunks. Like {@link org.apache.lucene.search.highlight.SimpleFragmenter},
+ * but not confused by very large chunks.
+ *
+ * @lucene.experimental
+ */
+public class SimpleFragmenter implements Fragmenter {
+ private static final int DEFAULT_FRAGMENT_SIZE = 100;
+ private OffsetAttribute offsetAtt;
+ private int fragmentSize;
+ private int lastFragStart;
+ /** Creates a fragmenter with the default fragment size (100 characters). */
+ public SimpleFragmenter() {
+ this(DEFAULT_FRAGMENT_SIZE);
+ }
+
+ /**
+ * Creates a fragmenter that opens a new fragment once a token ends more than the given size past the fragment start.
+ * @param fragmentSize size in number of characters of each fragment
+ */
+ public SimpleFragmenter(int fragmentSize) {
+ this.fragmentSize = fragmentSize;
+ }
+ /** Binds to the stream's offset attribute and restarts fragment tracking at offset 0. */
+ @Override
+ public void start(String originalText, TokenStream stream) {
+ this.offsetAtt = stream.addAttribute(OffsetAttribute.class);
+ this.lastFragStart = 0;
+ }
+ /** Tells whether the current token opens a new fragment, remembering its start offset when it does. */
+ @Override
+ public boolean isNewFragment () {
+ if (offsetAtt.endOffset() - lastFragStart <= fragmentSize) {
+ return false; // current token still fits in the running fragment
+ }
+ lastFragStart = offsetAtt.startOffset(); // this token opens the next fragment
+ return true;
+ }
+}
Index: lucene/src/java/org/apache/lucene/search/TermScorer.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/TermScorer.java (revision 1150180)
+++ lucene/src/java/org/apache/lucene/search/TermScorer.java (working copy)
@@ -211,7 +211,7 @@
}
@Override
- public void collect() {
+ public void collect() throws IOException {
collector.collectLeafPosition(scorer, interval, docID);
}
Index: lucene/src/java/org/apache/lucene/search/positions/BlockPositionIterator.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/positions/BlockPositionIterator.java (revision 1150180)
+++ lucene/src/java/org/apache/lucene/search/positions/BlockPositionIterator.java (working copy)
@@ -119,7 +119,7 @@
}
@Override
- public void collect() {
+ public void collect() throws IOException {
collector.collectComposite(scorer, interval, currentDoc);
for (PositionIntervalIterator iter : iterators) {
iter.collect();
Index: lucene/src/java/org/apache/lucene/search/positions/DisjunctionPositionIterator.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/positions/DisjunctionPositionIterator.java (revision 1150180)
+++ lucene/src/java/org/apache/lucene/search/positions/DisjunctionPositionIterator.java (working copy)
@@ -68,7 +68,7 @@
}
@Override
- public void collect() {
+ public void collect() throws IOException {
collector.collectComposite(scorer, queue.queueInterval, currentDoc);
iterators[queue.top().index].collect();
}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttribute.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttribute.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttribute.java (revision 0)
@@ -0,0 +1,36 @@
+package org.apache.lucene.search.poshighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.Attribute;
+
+/**
+ * A Token's score. The default value is 0.
+ *
+ * @lucene.experimental
+ */
+
+public interface ScoreAttribute extends Attribute {
+
+ /** Returns this Token's score. Defaults to 0. */
+ public float score();
+
+ /** Sets this Token's score.
+ @param score the value subsequently returned by {@link #score()} */
+ public void setScore(float score);
+}
Index: lucene/src/java/org/apache/lucene/search/IndexSearcher.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/IndexSearcher.java (revision 1150180)
+++ lucene/src/java/org/apache/lucene/search/IndexSearcher.java (working copy)
@@ -564,7 +564,7 @@
assert filter != null;
- Scorer scorer = weight.scorer(context, ScorerContext.def());
+ Scorer scorer = weight.scorer(context, ScorerContext.def().needsPositions(collector.needsPositions()).needsPayloads(collector.needsPayloads()));
if (scorer == null) {
return;
}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttributeImpl.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttributeImpl.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScoreAttributeImpl.java (revision 0)
@@ -0,0 +1,83 @@
+package org.apache.lucene.search.poshighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.AttributeImpl;
+
+/**
+ * A Token's score. The default value is 0.
+ *
+ * @lucene.experimental
+ */
+
+public class ScoreAttributeImpl extends AttributeImpl implements ScoreAttribute {
+
+ private float score = 0;
+ /** Creates an attribute with the default score of 0. */
+ public ScoreAttributeImpl () {
+ this (0);
+ }
+ /** Creates an attribute holding the given initial score. */
+ public ScoreAttributeImpl(float score) {
+ this.score = score;
+ }
+
+ /** @return this Token's score. Defaults to 0. */
+ @Override
+ public float score() {
+ return score;
+ }
+
+ /** Set the score.
+ @see #score() */
+ @Override
+ public void setScore(float score) {
+ this.score = score;
+ }
+ /** Resets the score to its default of 0. */
+ @Override
+ public void clear() {
+ score = 0;
+ }
+ /** Two attributes are equal when their scores have the same bit pattern, which keeps equals consistent with {@link #hashCode()}. */
+ @Override
+ public boolean equals(Object other) {
+ if (other == this) {
+ return true;
+ }
+ // Compare raw bits rather than ==: 0.0f == -0.0f although their hash codes differ, and NaN == NaN is false.
+ if (other instanceof ScoreAttributeImpl) {
+ final ScoreAttributeImpl o = (ScoreAttributeImpl) other;
+ return Float.floatToIntBits(this.score) == Float.floatToIntBits(o.score);
+ }
+
+ return false;
+ }
+ /** Returns the score's bit pattern, the same value Float.hashCode() yields, without boxing. */
+ @Override
+ public int hashCode() {
+ return Float.floatToIntBits(score);
+ }
+ /** Copies this score into {@code target}, which must be a {@link ScoreAttribute}. */
+ @Override
+ public void copyTo(AttributeImpl target) {
+ ScoreAttribute t = (ScoreAttribute) target;
+ t.setScore(score);
+ }
+
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionOffsetMapper.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionOffsetMapper.java (revision 1150180)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PositionOffsetMapper.java (working copy)
@@ -1,73 +0,0 @@
-package org.apache.lucene.search.poshighlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.index.TermVectorMapper;
-import org.apache.lucene.index.TermVectorOffsetInfo;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.BytesRef;
-
-/**
- * Create a map of position->offsets using term vectors. TODO: In highlighting, we don't really need the
- * entire map; make a sparse map including only required positions.
- *
- * @lucene.experimental
- */
-
-public class PositionOffsetMapper extends TermVectorMapper {
- private int maxPos = 0;
- private static final int BUF_SIZE = 128;
- int startOffset[] = new int[BUF_SIZE], endOffset[] = new int[BUF_SIZE];
-
- public void setExpectations(String field, int numTerms,
- boolean storeOffsets, boolean storePositions) {
- }
-
- public void map(BytesRef term, int frequency,
- TermVectorOffsetInfo[] offsets, int[] positions)
- {
- for (int i = 0; i < positions.length; i++) {
- int pos = positions[i];
- if (pos >= startOffset.length) {
- grow (pos + BUF_SIZE);
- maxPos = pos;
- } else if (pos > maxPos) {
- maxPos = pos;
- }
- startOffset[pos] = offsets[i].getStartOffset();
- endOffset[pos] = offsets[i].getEndOffset();
- }
- }
-
- private void grow (int size) {
- startOffset = ArrayUtil.grow (startOffset, size);
- endOffset = ArrayUtil.grow (endOffset, size);
- }
-
- public int getStartOffset(int pos) {
- return startOffset[pos];
- }
-
- public int getEndOffset(int pos) {
- return endOffset[pos];
- }
-
- public int getMaxPosition() {
- return maxPos;
- }
-}
\ No newline at end of file
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffsetMap.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffsetMap.java (revision 1149428)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosOffsetMap.java (working copy)
@@ -18,44 +18,76 @@
*/
import java.io.IOException;
+import java.util.Iterator;
+import java.util.SortedMap;
+import java.util.TreeMap;
-import org.apache.lucene.search.positions.PositionIntervalIterator;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermVectorMapper;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.lucene.util.BytesRef;
/**
- * Present an array of PositionIntervals as an Iterator.
+ * Maps positions to offsets. Provides an Iterator sorted by start position. Also provides
+ * direct lookup of offsets given start position. The mapping may be built up using {@link #put(int, int, int, int)}
+ * or retrieved from term vectors in the index using {@link #getTermVectorOffsets(IndexReader, int, String)}.
+ *
* @lucene.experimental
*/
-public class PositionIntervalArrayIterator extends PositionIntervalIterator {
+public class PosOffsetMap {
- private int next = 0;
- private int count;
- private PositionInterval[] positions;
-
- public PositionIntervalArrayIterator (PositionInterval[] positions, int count) {
- super(null);
- this.positions = positions;
- this.count = count;
+ private SortedMap posOffsetMap;
+
+ public PosOffsetMap() {
+ this.posOffsetMap = new TreeMap();
}
-
- @Override
- public PositionInterval next() {
- if (next >= count)
- return null;
- return positions[next++];
+
+ public void put(int startPosition, int endPosition, int startOffset, int endOffset) {
+ PosOffset po = new PosOffset(startPosition, endPosition, startOffset, endOffset);
+ posOffsetMap.put(startPosition, po);
}
- @Override
- public PositionIntervalIterator[] subs(boolean inOrder) {
- return EMPTY;
+ public boolean containsKey(int begin) {
+ return posOffsetMap.containsKey(begin);
}
- @Override
- public void collect() {
+ public PosOffset getPosOffset (int pos) {
+ return posOffsetMap.get(pos);
+ }
+
+ public Iterator iterator() {
+ return posOffsetMap.values().iterator();
}
- @Override
- public int advanceTo(int docId) throws IOException {
- return 0;
+ public void getTermVectorOffsets (IndexReader reader, int docid, String fieldName) throws IOException {
+ reader.getTermFreqVector(docid, fieldName, new OffsetMapper());
}
+ private class OffsetMapper extends TermVectorMapper {
+
+ public void setExpectations(String field, int numTerms,
+ boolean storeOffsets, boolean storePositions) {
+ }
+
+ public void map(BytesRef term, int frequency,
+ TermVectorOffsetInfo[] offsets, int[] positions) {
+ for (int i = 0; i < positions.length; i++) {
+ int pos = positions[i];
+ PosOffset po = posOffsetMap.get(pos);
+ if (po != null) {
+ po.startOffset = offsets[i].getStartOffset();
+ po.endOffset = offsets[i].getEndOffset();
+ }
+ }
+ }
+
+ }
+
+ public TokenStream getPosTokenStream(IndexReader reader, int docid,
+ String fieldName, int slop) throws IOException {
+ return new PosTokenStream(reader.document(docid).getFieldable(fieldName).stringValue(),
+ PosOffsetMap.this, slop);
+ }
+
}
\ No newline at end of file
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosCollector.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosCollector.java (revision 1150180)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosCollector.java (working copy)
@@ -56,17 +56,17 @@
}
}
- private boolean addDoc (int doc) {
+ private ScorePosDoc addDoc (int doc) {
if (count <= 0 || docs[count-1].doc != doc) {
ScorePosDoc spdoc = new ScorePosDoc (doc);
docs[count++] = spdoc;
- return true;
+ return spdoc;
}
- return false;
+ return null;
}
public boolean acceptsDocsOutOfOrder() {
- return false;
+ return true;
}
public void setScorer(Scorer scorer) throws IOException {
@@ -91,12 +91,15 @@
@Override
public boolean needsPositions() { return true; }
+
+ @Override
+ public boolean needsPayloads() { return true; }
@Override
public void collectLeafPosition(Scorer scorer, PositionInterval interval,
- int docID) {
- addDoc(docID);
- docs[count - 1].storePosition(interval);
+ int docID) throws IOException {
+ addDoc(docID);
+ docs[count - 1].storePosition(interval);
}
@Override
Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadWriter.java
===================================================================
--- lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadWriter.java (revision 0)
+++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadWriter.java (revision 0)
@@ -0,0 +1,67 @@
+package org.apache.lucene.search.poshighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.index.Payload;
+
+/**
+ * Writes offsets to a payload.
+ *
+ * @lucene.experimental
+ */
+
+public class OffsetPayloadWriter extends TokenFilter {
+
+ private OffsetAttribute offsetAtt;
+ private PayloadAttribute payloadAtt;
+ private Payload payload = new Payload (new byte[6]);
+
+ public OffsetPayloadWriter(TokenStream input) {
+ super(input);
+ this.offsetAtt = addAttribute(OffsetAttribute.class);
+ this.payloadAtt = addAttribute(PayloadAttribute.class);
+ }
+ /** Advances the wrapped stream and stores the token's start offset and length in the reused 6-byte payload. */
+ @Override
+ public boolean incrementToken() throws IOException {
+ final boolean hasToken = input.incrementToken();
+ if (hasToken) {
+ final byte[] buf = payload.getData();
+ final int begin = offsetAtt.startOffset();
+ final int length = offsetAtt.endOffset() - begin;
+ // bytes 0-3: start offset, least significant byte first
+ // (do we really need 4 bytes here? 3 might be enough)
+ for (int i = 0; i < 4; i++) {
+ buf[i] = (byte) (begin >> (8 * i));
+ }
+ // bytes 4-5: token length, least significant byte first
+ // (2 bytes for the token length seems big too? maybe 1 would suffice?)
+ buf[4] = (byte) length;
+ buf[5] = (byte) (length >> 8);
+ payloadAtt.setPayload(payload);
+ }
+ return hasToken;
+ }
+
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScorePosDoc.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScorePosDoc.java (revision 1150180)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/ScorePosDoc.java (working copy)
@@ -17,50 +17,46 @@
* limitations under the License.
*/
-import java.util.Comparator;
+import java.io.IOException;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval;
-import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
-/** Used to accumulate position intervals while scoring
+/** Used to accumulate position intervals while scoring
+ *
* @lucene.experimental
*/
public class ScorePosDoc extends ScoreDoc {
- public int posCount = 0;
- public PositionInterval[] positions;
+ private PosOffsetMap positionMap;
+ private BytesRef bytes = new BytesRef(6);
public ScorePosDoc(int doc) {
super(doc, 0);
- positions = new PositionInterval[32];
+ this.positionMap = new PosOffsetMap();
}
- public void storePosition (PositionInterval pos) {
- ensureStorage();
- positions[posCount++] = (PositionInterval) pos.clone();
+ public PosOffsetMap getPositionMap () {
+ return positionMap;
}
- private void ensureStorage () {
- if (posCount >= positions.length) {
- PositionInterval temp[] = new PositionInterval[positions.length * 2];
- System.arraycopy(positions, 0, temp, 0, positions.length);
- positions = temp;
+ public void storePosition (PositionInterval interval) throws IOException {
+ // Records the interval's positions with the offsets decoded from its payload (0, 0 when it has none).
+ // Intervals are sometimes reported twice (ConjunctionPosIterator): once in the call to advanceTo() (see PosCollector.collect())
+ // and again while iterating over remaining positions explicitly; skip the repeat, since its payload(s) are already consumed.
+ if (positionMap.containsKey(interval.begin)) {
+ return;
+ }
+ if (interval.payloadAvailable()) {
+ interval.nextPayload(bytes);
+ final int off = bytes.offset; // decode relative to the BytesRef offset: valid data need not start at index 0
+ int start = (bytes.bytes[off] & 0xff) | ((bytes.bytes[off + 1] & 0xff) << 8) | ((bytes.bytes[off + 2] & 0xff) << 16) | ((bytes.bytes[off + 3] & 0xff) << 24);
+ int tokenSize = (bytes.bytes[off + 4] & 0xff) | ((bytes.bytes[off + 5] & 0xff) << 8);
+ positionMap.put(interval.begin, interval.end, start, start + tokenSize);
+ } else {
+ positionMap.put(interval.begin, interval.end, 0, 0);
}
}
- public PositionInterval[] sortedPositions() {
- ArrayUtil.mergeSort(positions, 0, posCount, new Comparator() {
- public int compare(PositionInterval o1, PositionInterval o2) {
- return
- o1.begin < o2.begin ? -1 :
- (o1.begin > o2.begin ? 1 :
- (o1.end < o2.end ? -1 :
- (o1.end > o2.end ? 1 :
- 0)));
- }
-
- });
- return positions;
- }
}
Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadReader.java
===================================================================
--- lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadReader.java (revision 0)
+++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/poshighlight/OffsetPayloadReader.java (revision 0)
@@ -0,0 +1,60 @@
+package org.apache.lucene.search.poshighlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+
+/**
+ * Reads offsets from a payload.
+ *
+ * @lucene.experimental
+ */
+
+public class OffsetPayloadReader extends TokenFilter {
+
+ private OffsetAttribute offsetAtt;
+ private PayloadAttribute payloadAtt;
+
+ protected OffsetPayloadReader(TokenStream input) {
+ super(input);
+ offsetAtt = addAttribute(OffsetAttribute.class);
+ payloadAtt = addAttribute(PayloadAttribute.class);
+ }
+ /** Advances the wrapped stream and overwrites the token's offsets with those decoded from its payload. */
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (! input.incrementToken())
+ return false;
+ // Layout: 4 bytes start offset, then 2 bytes token length, least significant byte first. NOTE(review): getPayload() is not null-checked.
+ byte[] payloadData = payloadAtt.getPayload().getData();
+ int po = payloadAtt.getPayload().getOffset();
+ // Mask each byte with 0xff: Java bytes are signed, so a value >= 0x80 would otherwise sign-extend and corrupt the result.
+ int startOffset = (payloadData[po] & 0xff) | ((payloadData[po+1] & 0xff) << 8) | ((payloadData[po+2] & 0xff) << 16) | ((payloadData[po+3] & 0xff) << 24);
+ int tokenSize = (payloadData[po+4] & 0xff) | ((payloadData[po+5] & 0xff) << 8);
+
+ offsetAtt.setOffset(startOffset, startOffset + tokenSize);
+
+ return true;
+ }
+
+}
Index: lucene/src/java/org/apache/lucene/search/positions/IntervalQueueOr.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/positions/IntervalQueueOr.java (revision 1150180)
+++ lucene/src/java/org/apache/lucene/search/positions/IntervalQueueOr.java (working copy)
@@ -15,6 +15,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+import org.apache.lucene.search.positions.PositionIntervalIterator.PositionCollector;
import org.apache.lucene.search.positions.PositionIntervalIterator.PositionInterval;
/**
*
Index: lucene/src/java/org/apache/lucene/search/positions/WithinPositionIterator.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/positions/WithinPositionIterator.java (revision 1150180)
+++ lucene/src/java/org/apache/lucene/search/positions/WithinPositionIterator.java (working copy)
@@ -57,7 +57,7 @@
}
@Override
- public void collect() {
+ public void collect() throws IOException {
collector.collectComposite(null, interval, iterator.docID());
iterator.collect();
}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosHighlighter.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosHighlighter.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/poshighlight/PosHighlighter.java (revision 0)
@@ -0,0 +1,49 @@
+package org.apache.lucene.search.poshighlight;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.highlight.Encoder;
+import org.apache.lucene.search.highlight.Formatter;
+import org.apache.lucene.search.highlight.Highlighter;
+/** {@link Highlighter} constructed with a {@link PosScorer} as its fragment scorer; see {@link #getPosTokenStream}. */
+public class PosHighlighter extends Highlighter {
+
+  public PosHighlighter() {
+    super(new PosScorer());
+  }
+
+  public PosHighlighter(PosScorer fragmentScorer) {
+    super(fragmentScorer);
+  }
+
+  public PosHighlighter(Formatter formatter, PosScorer fragmentScorer) {
+    super(formatter, fragmentScorer);
+  }
+
+  public PosHighlighter(Formatter formatter, Encoder encoder, PosScorer fragmentScorer) {
+    super(formatter, encoder, fragmentScorer);
+  }
+
+  /**
+   * Searches for {@code q} through a {@code SingleDocFilter} on {@code docid}, collecting into a one-slot
+   * {@code PosCollector}, and builds a {@code PosTokenStream} over {@code text} from the hit's position map.
+   * A {@link MultiTermQuery} has its rewrite method changed in place. Term vector offsets are requested
+   * from the position map only when {@code termVectorField} is non-null. Returns null if no hit was collected.
+   */
+  public static TokenStream getPosTokenStream(Query q, final int docid, String termVectorField, IndexSearcher searcher, String text) throws IOException {
+    PosCollector hits = new PosCollector(1);
+    if (q instanceof MultiTermQuery) {
+      ((MultiTermQuery) q).setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
+    }
+    searcher.search(q, new SingleDocFilter(docid), hits);
+    ScorePosDoc match = hits.docs[0];
+    if (match != null && termVectorField != null) {
+      match.getPositionMap().getTermVectorOffsets(searcher.getIndexReader(), match.doc, termVectorField);
+    }
+    return match == null ? null : new PosTokenStream(text, match.getPositionMap(), 25);
+  }
+}