Index: solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java =================================================================== --- solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (revision 1076920) +++ solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (working copy) @@ -435,12 +435,20 @@ // fall back to analyzer tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]); } - + + int maxCharsToAnalyze = params.getFieldInt(fieldName, + HighlightParams.MAX_CHARS, + Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); + Highlighter highlighter; if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) { // TODO: this is not always necessary - eventually we would like to avoid this wrap // when it is not needed. - tstream = new CachingTokenFilter(tstream); + if (maxCharsToAnalyze < 0) { + tstream = new CachingTokenFilter(tstream); + } else { + tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze)); + } // get highlighter highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream); @@ -453,9 +461,7 @@ highlighter = getHighlighter(query, fieldName, req); } - int maxCharsToAnalyze = params.getFieldInt(fieldName, - HighlightParams.MAX_CHARS, - Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); + if (maxCharsToAnalyze < 0) { highlighter.setMaxDocCharsToAnalyze(docTexts[j].length()); } else { Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 1078240) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy) @@ -197,6 +197,11 @@ tokenStream.reset(); TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); + + if (fragmentScorer instanceof QueryScorer) { + 
((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze); + } + TokenStream newStream = fragmentScorer.init(tokenStream); if(newStream != null) { tokenStream = newStream; Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 1076920) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (working copy) @@ -56,6 +56,7 @@ private boolean expandMultiTermQuery; private boolean cachedTokenStream; private boolean wrapToCaching = true; + private int maxDocCharsToAnalyze; public WeightedSpanTermExtractor() { } @@ -320,13 +321,13 @@ private AtomicReaderContext getLeafContextForField(String field) throws IOException { if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) { - tokenStream = new CachingTokenFilter(tokenStream); + tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze)); cachedTokenStream = true; } AtomicReaderContext context = readers.get(field); if (context == null) { MemoryIndex indexer = new MemoryIndex(); - indexer.addField(field, tokenStream); + indexer.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze)); tokenStream.reset(); IndexSearcher searcher = indexer.createSearcher(); // MEM index has only atomic ctx @@ -545,4 +546,8 @@ public void setWrapIfNotCachingTokenFilter(boolean wrap) { this.wrapToCaching = wrap; } + + protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) { + this.maxDocCharsToAnalyze = maxDocCharsToAnalyze; + } } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java =================================================================== --- 
lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision 1076920) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (working copy) @@ -54,6 +54,7 @@ private IndexReader reader; private boolean skipInitExtractor; private boolean wrapToCaching = true; + private int maxCharsToAnalyze = Integer.MAX_VALUE; /** * @param query Query to use for highlighting */ @@ -209,7 +210,7 @@ private TokenStream initExtractor(TokenStream tokenStream) throws IOException { WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor() : new WeightedSpanTermExtractor(defaultField); - + qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze); qse.setExpandMultiTermQuery(expandMultiTermQuery); qse.setWrapIfNotCachingTokenFilter(wrapToCaching); if (reader == null) { @@ -265,4 +266,8 @@ public void setWrapIfNotCachingTokenFilter(boolean wrap) { this.wrapToCaching = wrap; } + + public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) { + this.maxCharsToAnalyze = maxDocCharsToAnalyze; + } } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java (revision 0) @@ -0,0 +1,57 @@ +package org.apache.lucene.search.highlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +/** + * This TokenFilter limits the number of tokens while indexing by adding up the + * current offset. + */ +public final class OffsetLimitTokenFilter extends TokenFilter { + + private int offsetCount; + private OffsetAttribute offsetAttrib = addAttribute(OffsetAttribute.class); + private int offsetLimit; + + public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) { + super(input); + this.offsetLimit = offsetLimit; + } + + @Override + public boolean incrementToken() throws IOException { + if (offsetCount < offsetLimit && input.incrementToken()) { + int offsetLength = offsetAttrib.endOffset() - offsetAttrib.startOffset(); + offsetCount += offsetLength; + return true; + } + return false; + } + + @Override + public void reset() throws IOException { + super.reset(); + offsetCount = 0; + } + +} Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java (revision 0) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java (revision 0) @@ -0,0 +1,60 @@ +package org.apache.lucene.search.highlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more +
* contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +public class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase { + + public void testFilter() throws Exception { + TokenStream stream = new MockTokenizer(new StringReader( + "short toolong evenmuchlongertext a ab toolong foo"), + MockTokenizer.WHITESPACE, false); + OffsetLimitTokenFilter filter = new OffsetLimitTokenFilter(stream, 10); + assertTokenStreamContents(filter, new String[] {"short", "toolong"}); + + stream = new MockTokenizer(new StringReader( + "short toolong evenmuchlongertext a ab toolong foo"), + MockTokenizer.WHITESPACE, false); + filter = new OffsetLimitTokenFilter(stream, 12); + assertTokenStreamContents(filter, new String[] {"short", "toolong"}); + + stream = new MockTokenizer(new StringReader( + "short toolong evenmuchlongertext a ab toolong foo"), + MockTokenizer.WHITESPACE, false); + filter = new OffsetLimitTokenFilter(stream, 30); + assertTokenStreamContents(filter, new String[] {"short", "toolong", + "evenmuchlongertext"}); + + + checkOneTermReuse(new 
Analyzer() { + + @Override + public TokenStream tokenStream(String fieldName, Reader reader) { + return new OffsetLimitTokenFilter(new MockTokenizer(reader, + MockTokenizer.WHITESPACE, false), 10); + } + }, "llenges", "llenges"); + } +}