Index: solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java =================================================================== --- solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java (revision 1399152) +++ solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java (working copy) @@ -150,6 +150,25 @@ "//arr[@name='tv_text']/str[.=' long fragments.']" ); } + + @Test + public void testTermVectorWithoutOffsetsHighlight() { + + HashMap args = new HashMap(); + args.put("hl", "true"); + args.put("hl.fl", "tv_no_off_text"); + + TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory("standard", 0, 200, args); + + assertU(adoc("tv_no_off_text", "Crackerjack Cameron", "id", "1")); + assertU(commit()); + assertU(optimize()); + + assertQ("Fields with term vectors switched on but no offsets should be correctly highlighted", + sumLRF.makeRequest("tv_no_off_text:cameron"), + "//arr[@name='tv_no_off_text']/str[.='Crackerjack Cameron']"); + + } @Test public void testTermOffsetsTokenStream() throws Exception { Index: solr/core/src/test-files/solr/collection1/conf/schema.xml =================================================================== --- solr/core/src/test-files/solr/collection1/conf/schema.xml (revision 1399152) +++ solr/core/src/test-files/solr/collection1/conf/schema.xml (working copy) @@ -649,6 +649,8 @@ termVectors="true" termPositions="true" termOffsets="true"/> + Index: solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java =================================================================== --- solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (revision 1399152) +++ solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (working copy) @@ -16,18 +16,6 @@ */ package org.apache.solr.highlight; -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import 
java.util.LinkedList; -import java.util.List; -import java.util.ListIterator; -import java.util.Set; - - import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -38,19 +26,16 @@ import org.apache.lucene.index.StoredDocument; import org.apache.lucene.search.Query; import org.apache.lucene.search.highlight.*; -import org.apache.lucene.search.vectorhighlight.BoundaryScanner; -import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; -import org.apache.lucene.search.vectorhighlight.FieldQuery; -import org.apache.lucene.search.vectorhighlight.FragListBuilder; -import org.apache.lucene.search.vectorhighlight.FragmentsBuilder; +import org.apache.lucene.search.highlight.Formatter; +import org.apache.lucene.search.vectorhighlight.*; import org.apache.lucene.util.AttributeSource.State; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.HighlightParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; -import org.apache.solr.core.SolrConfig; import org.apache.solr.core.PluginInfo; +import org.apache.solr.core.SolrConfig; import org.apache.solr.core.SolrCore; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.schema.IndexSchema; @@ -62,6 +47,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.io.StringReader; +import java.util.*; + /** * * @since solr 1.3 @@ -462,15 +451,10 @@ List frags = new ArrayList(); TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization - try { - TokenStream tvStream = TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName); - if (tvStream != null) { - tots = new TermOffsetsTokenStream(tvStream); - } + TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), 
docId, fieldName); + if (tvStream != null) { + tots = new TermOffsetsTokenStream(tvStream); } - catch (IllegalArgumentException e) { - // No problem. But we can't use TermOffsets optimization. - } for (int j = 0; j < docTexts.length; j++) { if( tots != null ) { Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1399152) +++ lucene/CHANGES.txt (working copy) @@ -49,6 +49,10 @@ * LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries no longer seek when writing. (Adrien Grand, Robert Muir) +* LUCENE-4479: Rename TokenSources.getTokenStream(IndexReader, int, String) + to TokenSources.getTokenStreamWithOffsets, and return null on failure + rather than throwing IllegalArgumentException. (Alan Woodward) + Bug Fixes * LUCENE-1822: BaseFragListBuilder hard-coded 6 char margin is too naive. @@ -60,6 +64,9 @@ * LUCENE-4485: When CheckIndex terms, terms/docs pairs and tokens, these counts now all exclude deleted documents. (Mike McCandless) +* LUCENE-4479: Highlighter works correctly for fields with term vector + positions, but no offsets. (Alan Woodward) + Optimizations * LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java =================================================================== --- lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (revision 1399152) +++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (working copy) @@ -17,8 +17,6 @@ * limitations under the License. 
*/ -import java.io.IOException; - import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -42,6 +40,8 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import java.io.IOException; + // LUCENE-2874 public class TokenSourcesTest extends LuceneTestCase { private static final String FIELD = "text"; @@ -260,4 +260,40 @@ } } + public void testTermVectorWithoutOffsetsThrowsException() + throws IOException, InvalidTokenOffsetsException { + final String TEXT = "the fox did not jump"; + final Directory directory = newDirectory(); + final IndexWriter indexWriter = new IndexWriter(directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, null)); + try { + final Document document = new Document(); + FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); + customType.setStoreTermVectors(true); + customType.setStoreTermVectorOffsets(false); + customType.setStoreTermVectorPositions(true); + document.add(new Field(FIELD, new OverlappingTokenStream(), customType)); + indexWriter.addDocument(document); + } finally { + indexWriter.close(); + } + final IndexReader indexReader = DirectoryReader.open(directory); + try { + assertEquals(1, indexReader.numDocs()); + final TokenStream tokenStream = TokenSources + .getTokenStream( + indexReader.getTermVector(0, FIELD), + false); + fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets"); + } + catch (IllegalArgumentException e) { + // expected + } + finally { + indexReader.close(); + directory.close(); + } + } + + } Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java =================================================================== --- lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 1399152) +++ 
lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (working copy) @@ -121,14 +121,10 @@ return getTokenStream(vector, false); } - private static boolean hasPositions(Terms vector) throws IOException { - return vector.hasPositions(); - } - /** - * Low level api. Returns a token stream or null if no offset info available - * in index. This can be used to feed the highlighter with a pre-parsed token - * stream + * Low level api. Returns a token stream generated from a {@link Terms}. This + * can be used to feed the highlighter with a pre-parsed token + * stream. The {@link Terms} must have offsets available. * * In my tests the speeds to recreate 1000 token streams using this method * are: - with TermVector offset only data stored - 420 milliseconds - with @@ -149,11 +145,18 @@ * @param tokenPositionsGuaranteedContiguous true if the token position * numbers have no overlaps or gaps. If looking to eek out the last * drops of performance, set to true. If in doubt, set to false. + * + * @throws IllegalArgumentException if no offsets are available */ public static TokenStream getTokenStream(Terms tpv, boolean tokenPositionsGuaranteedContiguous) throws IOException { - if (!tokenPositionsGuaranteedContiguous && hasPositions(tpv)) { + + if (!tpv.hasOffsets()) { + throw new IllegalArgumentException("Cannot create TokenStream from Terms without offsets"); + } + + if (!tokenPositionsGuaranteedContiguous && tpv.hasPositions()) { return new TokenStreamFromTermPositionVector(tpv); } @@ -261,24 +264,31 @@ return new StoredTokenStream(tokensInOriginalOrder); } - public static TokenStream getTokenStream(IndexReader reader, int docId, - String field) throws IOException { + /** + * Returns a {@link TokenStream} with positions and offsets constructed from + * field termvectors. If the field has no termvectors, or positions or offsets + * are not included in the termvector, return null. 
+ * @param reader the {@link IndexReader} to retrieve term vectors from + * @param docId the document to retrieve termvectors for + * @param field the field to retrieve termvectors for + * @return a {@link TokenStream}, or null if positions and offsets are not available + * @throws IOException + */ + public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId, + String field) throws IOException { Fields vectors = reader.getTermVectors(docId); if (vectors == null) { - throw new IllegalArgumentException(field + " in doc #" + docId - + "does not have any term position data stored"); + return null; } Terms vector = vectors.terms(field); if (vector == null) { - throw new IllegalArgumentException(field + " in doc #" + docId - + "does not have any term position data stored"); + return null; } - if (!hasPositions(vector)) { - throw new IllegalArgumentException(field + " in doc #" + docId - + "does not have any term position data stored"); + if (!vector.hasPositions() || !vector.hasOffsets()) { + return null; } return getTokenStream(vector);