Index: solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java
===================================================================
--- solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java (revision 1399152)
+++ solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java (working copy)
@@ -150,6 +150,25 @@
"//arr[@name='tv_text']/str[.=' long fragments.']"
);
}
+
+ @Test
+ public void testTermVectorWithoutOffsetsHighlight() {
+
+ HashMap args = new HashMap();
+ args.put("hl", "true");
+ args.put("hl.fl", "tv_no_off_text");
+
+ TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory("standard", 0, 200, args);
+
+ assertU(adoc("tv_no_off_text", "Crackerjack Cameron", "id", "1"));
+ assertU(commit());
+ assertU(optimize());
+
+ assertQ("Fields with term vectors switched on but no offsets should be correctly highlighted",
+ sumLRF.makeRequest("tv_no_off_text:cameron"),
+ "//arr[@name='tv_no_off_text']/str[.='Crackerjack Cameron']");
+
+ }
@Test
public void testTermOffsetsTokenStream() throws Exception {
Index: solr/core/src/test-files/solr/collection1/conf/schema.xml
===================================================================
--- solr/core/src/test-files/solr/collection1/conf/schema.xml (revision 1399152)
+++ solr/core/src/test-files/solr/collection1/conf/schema.xml (working copy)
@@ -649,6 +649,8 @@
termVectors="true" termPositions="true" termOffsets="true"/>
+   <field name="tv_no_off_text" type="text" indexed="true" stored="true"
+          termVectors="true" termPositions="true"/>
Index: solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
===================================================================
--- solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (revision 1399152)
+++ solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (working copy)
@@ -16,18 +16,6 @@
*/
package org.apache.solr.highlight;
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.ListIterator;
-import java.util.Set;
-
-
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -38,19 +26,16 @@
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.*;
-import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
-import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
-import org.apache.lucene.search.vectorhighlight.FieldQuery;
-import org.apache.lucene.search.vectorhighlight.FragListBuilder;
-import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
+import org.apache.lucene.search.highlight.Formatter;
+import org.apache.lucene.search.vectorhighlight.*;
import org.apache.lucene.util.AttributeSource.State;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
-import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.PluginInfo;
+import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
@@ -62,6 +47,10 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.*;
+
/**
*
* @since solr 1.3
@@ -462,15 +451,10 @@
List frags = new ArrayList();
TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
- try {
- TokenStream tvStream = TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName);
- if (tvStream != null) {
- tots = new TermOffsetsTokenStream(tvStream);
- }
+ TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
+ if (tvStream != null) {
+ tots = new TermOffsetsTokenStream(tvStream);
}
- catch (IllegalArgumentException e) {
- // No problem. But we can't use TermOffsets optimization.
- }
for (int j = 0; j < docTexts.length; j++) {
if( tots != null ) {
Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt (revision 1399152)
+++ lucene/CHANGES.txt (working copy)
@@ -49,6 +49,10 @@
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
no longer seek when writing. (Adrien Grand, Robert Muir)
+* LUCENE-4479: Rename TokenSources.getTokenStream(IndexReader, int, String)
+  to TokenSources.getTokenStreamWithOffsets, and return null on failure
+  rather than throwing IllegalArgumentException. (Alan Woodward)
+
Bug Fixes
* LUCENE-1822: BaseFragListBuilder hard-coded 6 char margin is too naive.
@@ -60,6 +64,9 @@
* LUCENE-4485: When CheckIndex terms, terms/docs pairs and tokens,
these counts now all exclude deleted documents. (Mike McCandless)
+* LUCENE-4479: Highlighter works correctly for fields with term vector
+ positions, but no offsets. (Alan Woodward)
+
Optimizations
* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets
Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (revision 1399152)
+++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (working copy)
@@ -17,8 +17,6 @@
* limitations under the License.
*/
-import java.io.IOException;
-
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -42,6 +40,8 @@
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
+import java.io.IOException;
+
// LUCENE-2874
public class TokenSourcesTest extends LuceneTestCase {
private static final String FIELD = "text";
@@ -260,4 +260,40 @@
}
}
+ public void testTermVectorWithoutOffsetsThrowsException()
+ throws IOException, InvalidTokenOffsetsException {
+ final String TEXT = "the fox did not jump";
+ final Directory directory = newDirectory();
+ final IndexWriter indexWriter = new IndexWriter(directory,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, null));
+ try {
+ final Document document = new Document();
+ FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
+ customType.setStoreTermVectors(true);
+ customType.setStoreTermVectorOffsets(false);
+ customType.setStoreTermVectorPositions(true);
+ document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
+ indexWriter.addDocument(document);
+ } finally {
+ indexWriter.close();
+ }
+ final IndexReader indexReader = DirectoryReader.open(directory);
+ try {
+ assertEquals(1, indexReader.numDocs());
+ final TokenStream tokenStream = TokenSources
+ .getTokenStream(
+ indexReader.getTermVector(0, FIELD),
+ false);
+ fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
+ }
+ catch (IllegalArgumentException e) {
+ // expected
+ }
+ finally {
+ indexReader.close();
+ directory.close();
+ }
+ }
+
+
}
Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 1399152)
+++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (working copy)
@@ -121,14 +121,10 @@
return getTokenStream(vector, false);
}
- private static boolean hasPositions(Terms vector) throws IOException {
- return vector.hasPositions();
- }
-
/**
- * Low level api. Returns a token stream or null if no offset info available
- * in index. This can be used to feed the highlighter with a pre-parsed token
- * stream
+ * Low level api. Returns a token stream generated from a {@link Terms}. This
+ * can be used to feed the highlighter with a pre-parsed token
+ * stream. The {@link Terms} must have offsets available.
*
* In my tests the speeds to recreate 1000 token streams using this method
* are: - with TermVector offset only data stored - 420 milliseconds - with
@@ -149,11 +145,18 @@
* @param tokenPositionsGuaranteedContiguous true if the token position
* numbers have no overlaps or gaps. If looking to eek out the last
* drops of performance, set to true. If in doubt, set to false.
+ *
+ * @throws IllegalArgumentException if no offsets are available
*/
public static TokenStream getTokenStream(Terms tpv,
boolean tokenPositionsGuaranteedContiguous)
throws IOException {
- if (!tokenPositionsGuaranteedContiguous && hasPositions(tpv)) {
+
+ if (!tpv.hasOffsets()) {
+ throw new IllegalArgumentException("Cannot create TokenStream from Terms without offsets");
+ }
+
+ if (!tokenPositionsGuaranteedContiguous && tpv.hasPositions()) {
return new TokenStreamFromTermPositionVector(tpv);
}
@@ -261,24 +264,31 @@
return new StoredTokenStream(tokensInOriginalOrder);
}
- public static TokenStream getTokenStream(IndexReader reader, int docId,
- String field) throws IOException {
+ /**
+ * Returns a {@link TokenStream} with positions and offsets constructed from
+ * field termvectors. If the field has no termvectors, or positions or offsets
+ * are not included in the termvector, return null.
+ * @param reader the {@link IndexReader} to retrieve term vectors from
+ * @param docId the document to retrieve termvectors for
+ * @param field the field to retrieve termvectors for
+ * @return a {@link TokenStream}, or null if positions and offsets are not available
+   * @throws IOException if term vectors cannot be read from the reader
+ */
+ public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
+ String field) throws IOException {
Fields vectors = reader.getTermVectors(docId);
if (vectors == null) {
- throw new IllegalArgumentException(field + " in doc #" + docId
- + "does not have any term position data stored");
+ return null;
}
Terms vector = vectors.terms(field);
if (vector == null) {
- throw new IllegalArgumentException(field + " in doc #" + docId
- + "does not have any term position data stored");
+ return null;
}
- if (!hasPositions(vector)) {
- throw new IllegalArgumentException(field + " in doc #" + docId
- + "does not have any term position data stored");
+ if (!vector.hasPositions() || !vector.hasOffsets()) {
+ return null;
}
return getTokenStream(vector);