diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java index b187a7c..7a16601 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java @@ -20,11 +20,6 @@ package org.apache.lucene.search.highlight; * limitations under the License. */ -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.Comparator; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; @@ -32,15 +27,15 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.index.*; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Comparator; + /** * Hides implementation issues associated with obtaining a TokenStream for use * with the higlighter - can obtain from TermFreqVectors with offsets and @@ -120,14 +115,10 @@ public class TokenSources { return getTokenStream(vector, false); } - private static boolean hasPositions(Terms vector) throws IOException { - return vector.hasPositions(); - } - /** - * Low level api. Returns a token stream or null if no offset info available - * in index. This can be used to feed the highlighter with a pre-parsed token - * stream + * Low level api. Returns a token stream generated from a {@link Terms}. This + * can be used to feed the highlighter with a pre-parsed token + * stream. The {@link Terms} must have offsets available. * * In my tests the speeds to recreate 1000 token streams using this method * are: - with TermVector offset only data stored - 420 milliseconds - with @@ -148,11 +139,18 @@ public class TokenSources { * @param tokenPositionsGuaranteedContiguous true if the token position * numbers have no overlaps or gaps. If looking to eek out the last * drops of performance, set to true. If in doubt, set to false. + * + * @throws IllegalArgumentException if no offsets are available */ public static TokenStream getTokenStream(Terms tpv, boolean tokenPositionsGuaranteedContiguous) throws IOException { - if (!tokenPositionsGuaranteedContiguous && hasPositions(tpv)) { + + if (!tpv.hasOffsets()) { + throw new IllegalArgumentException("Cannot create TokenStream from Terms without offsets"); + } + + if (!tokenPositionsGuaranteedContiguous && tpv.hasPositions()) { return new TokenStreamFromTermPositionVector(tpv); } @@ -260,24 +258,31 @@ public class TokenSources { return new StoredTokenStream(tokensInOriginalOrder); } - public static TokenStream getTokenStream(IndexReader reader, int docId, - String field) throws IOException { + /** + * Returns a {@link TokenStream} with positions and offsets constructed from + * field termvectors. If the field has no termvectors, or positions or offsets + * are not included in the termvector, return null. + * @param reader the {@link IndexReader} to retrieve term vectors from + * @param docId the document to retrieve termvectors for + * @param field the field to retrieve termvectors for + * @return a {@link TokenStream}, or null if positions and offsets are not available + * @throws IOException + */ + public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId, + String field) throws IOException { Fields vectors = reader.getTermVectors(docId); if (vectors == null) { - throw new IllegalArgumentException(field + " in doc #" + docId - + "does not have any term position data stored"); + return null; } Terms vector = vectors.terms(field); if (vector == null) { - throw new IllegalArgumentException(field + " in doc #" + docId - + "does not have any term position data stored"); + return null; } - if (!hasPositions(vector)) { - throw new IllegalArgumentException(field + " in doc #" + docId - + "does not have any term position data stored"); + if (!vector.hasPositions() || !vector.hasOffsets()) { + return null; } return getTokenStream(vector); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java index 8e8bea2..00cc34f 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java @@ -17,8 +17,6 @@ package org.apache.lucene.search.highlight; * limitations under the License. */ -import java.io.IOException; - import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -42,6 +40,8 @@ import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import java.io.IOException; + // LUCENE-2874 public class TokenSourcesTest extends LuceneTestCase { private static final String FIELD = "text"; @@ -260,4 +260,40 @@ public class TokenSourcesTest extends LuceneTestCase { } } + public void testTermVectorWithoutOffsetsThrowsException() + throws IOException, InvalidTokenOffsetsException { + final String TEXT = "the fox did not jump"; + final Directory directory = newDirectory(); + final IndexWriter indexWriter = new IndexWriter(directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, null)); + try { + final Document document = new Document(); + FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); + customType.setStoreTermVectors(true); + customType.setStoreTermVectorOffsets(false); + customType.setStoreTermVectorPositions(true); + document.add(new Field(FIELD, new OverlappingTokenStream(), customType)); + indexWriter.addDocument(document); + } finally { + indexWriter.close(); + } + final IndexReader indexReader = DirectoryReader.open(directory); + try { + assertEquals(1, indexReader.numDocs()); + final TokenStream tokenStream = TokenSources + .getTokenStream( + indexReader.getTermVector(0, FIELD), + false); + fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets"); + } + catch (IllegalArgumentException e) { + // expected + } + finally { + indexReader.close(); + directory.close(); + } + } + + } diff --git a/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java index efa8e80..4872868 100644 --- a/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java +++ b/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java @@ -16,18 +16,6 @@ */ package org.apache.solr.highlight; -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.ListIterator; -import java.util.Set; - - import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -36,19 +24,16 @@ import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexableField; import org.apache.lucene.search.Query; import org.apache.lucene.search.highlight.*; -import org.apache.lucene.search.vectorhighlight.BoundaryScanner; -import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; -import org.apache.lucene.search.vectorhighlight.FieldQuery; -import org.apache.lucene.search.vectorhighlight.FragListBuilder; -import org.apache.lucene.search.vectorhighlight.FragmentsBuilder; +import org.apache.lucene.search.highlight.Formatter; +import org.apache.lucene.search.vectorhighlight.*; import org.apache.lucene.util.AttributeSource.State; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.HighlightParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; -import org.apache.solr.core.SolrConfig; import org.apache.solr.core.PluginInfo; +import org.apache.solr.core.SolrConfig; import org.apache.solr.core.SolrCore; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.schema.IndexSchema; @@ -60,6 +45,10 @@ import org.apache.solr.util.plugin.PluginInfoInitialized; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.io.StringReader; +import java.util.*; + /** * * @since solr 1.3 @@ -457,14 +446,9 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf List frags = new ArrayList(); TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization - try { - TokenStream tvStream = TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName); - if (tvStream != null) { - tots = new TermOffsetsTokenStream(tvStream); - } - } - catch (IllegalArgumentException e) { - // No problem. But we can't use TermOffsets optimization. + TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName); + if (tvStream != null) { + tots = new TermOffsetsTokenStream(tvStream); } for (int j = 0; j < docTexts.length; j++) { diff --git a/solr/core/src/test-files/solr/collection1/conf/schema.xml b/solr/core/src/test-files/solr/collection1/conf/schema.xml index 6b67f50..c57cba9 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema.xml @@ -649,6 +649,8 @@ termVectors="true" termPositions="true" termOffsets="true"/> + diff --git a/solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java b/solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java index 391a1fc..8a84949 100755 --- a/solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java +++ b/solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java @@ -150,6 +150,25 @@ public class HighlighterTest extends SolrTestCaseJ4 { "//arr[@name='tv_text']/str[.=' long fragments.']" ); } + + @Test + public void testTermVectorWithoutOffsetsHighlight() { + + HashMap args = new HashMap(); + args.put("hl", "true"); + args.put("hl.fl", "tv_no_off_text"); + + TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory("standard", 0, 200, args); + + assertU(adoc("tv_no_off_text", "Crackerjack Cameron", "id", "1")); + assertU(commit()); + assertU(optimize()); + + assertQ("Fields with term vectors switched on but no offsets should be correctly highlighted", + sumLRF.makeRequest("tv_no_off_text:cameron"), + "//arr[@name='tv_no_off_text']/str[.='Crackerjack Cameron']"); + + } @Test public void testTermOffsetsTokenStream() throws Exception {