Index: solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java =================================================================== --- solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (revision 1076920) +++ solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (working copy) @@ -435,12 +435,20 @@ // fall back to analyzer tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]); } - + + int maxCharsToAnalyze = params.getFieldInt(fieldName, + HighlightParams.MAX_CHARS, + Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); + Highlighter highlighter; if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) { // TODO: this is not always necessary - eventually we would like to avoid this wrap // when it is not needed. - tstream = new CachingTokenFilter(tstream); + if (maxCharsToAnalyze < 0) { + tstream = new CachingTokenFilter(tstream); + } else { + tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze)); + } // get highlighter highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream); @@ -453,9 +461,7 @@ highlighter = getHighlighter(query, fieldName, req); } - int maxCharsToAnalyze = params.getFieldInt(fieldName, - HighlightParams.MAX_CHARS, - Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); + if (maxCharsToAnalyze < 0) { highlighter.setMaxDocCharsToAnalyze(docTexts[j].length()); } else { Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java (revision 1076920) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java (working copy) @@ -94,7 +94,7 @@ /* (non-Javadoc) * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream) */ - public TokenStream 
init(TokenStream tokenStream) { + public TokenStream init(TokenStream tokenStream, int maxDocCharsToAnalyze) { termAtt = tokenStream.addAttribute(CharTermAttribute.class); return null; } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (revision 1076920) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (working copy) @@ -37,7 +37,7 @@ * using the same {@link TokenStream} that was passed in. * @throws IOException */ - public TokenStream init(TokenStream tokenStream) throws IOException; + public TokenStream init(TokenStream tokenStream, int maxDocCharsToAnalyze) throws IOException; /** * Called when a new fragment is started for consideration. Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 1076920) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy) @@ -197,7 +197,7 @@ tokenStream.reset(); TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); - TokenStream newStream = fragmentScorer.init(tokenStream); + TokenStream newStream = fragmentScorer.init(tokenStream, maxDocCharsToAnalyze); if(newStream != null) { tokenStream = newStream; } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 1076920) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java 
(working copy) @@ -56,6 +56,7 @@ private boolean expandMultiTermQuery; private boolean cachedTokenStream; private boolean wrapToCaching = true; + private int maxDocCharsToAnalyze; public WeightedSpanTermExtractor() { } @@ -320,13 +321,13 @@ private AtomicReaderContext getLeafContextForField(String field) throws IOException { if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) { - tokenStream = new CachingTokenFilter(tokenStream); + tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze)); cachedTokenStream = true; } AtomicReaderContext context = readers.get(field); if (context == null) { MemoryIndex indexer = new MemoryIndex(); - indexer.addField(field, tokenStream); + indexer.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze)); tokenStream.reset(); IndexSearcher searcher = indexer.createSearcher(); // MEM index has only atomic ctx @@ -545,4 +546,8 @@ public void setWrapIfNotCachingTokenFilter(boolean wrap) { this.wrapToCaching = wrap; } + + protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) { + this.maxDocCharsToAnalyze = maxDocCharsToAnalyze; + } } Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision 1076920) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (working copy) @@ -54,6 +54,7 @@ private IndexReader reader; private boolean skipInitExtractor; private boolean wrapToCaching = true; + private int maxCharsToAnalyze; /** * @param query Query to use for highlighting @@ -173,7 +174,8 @@ /* (non-Javadoc) * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream) */ - public TokenStream init(TokenStream tokenStream) throws IOException { + public 
TokenStream init(TokenStream tokenStream, int maxDocCharsToAnalyze) throws IOException { + this.maxCharsToAnalyze = maxDocCharsToAnalyze; position = -1; termAtt = tokenStream.addAttribute(CharTermAttribute.class); posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class); @@ -209,7 +211,7 @@ private TokenStream initExtractor(TokenStream tokenStream) throws IOException { WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor() : new WeightedSpanTermExtractor(defaultField); - + qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze); qse.setExpandMultiTermQuery(expandMultiTermQuery); qse.setWrapIfNotCachingTokenFilter(wrapToCaching); if (reader == null) { Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java (revision 0) @@ -0,0 +1,57 @@ +package org.apache.lucene.search.highlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/**
+ * A TokenFilter that stops emitting tokens once the cumulative length of the
+ * character offsets consumed so far reaches the configured limit.
+ */
+public final class OffsetLimitTokenFilter extends TokenFilter {
+
+  private int offsetCount;
+  private OffsetAttribute offsetAttrib = getAttribute(OffsetAttribute.class);
+  private int offsetLimit;
+
+  public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) {
+    super(input);
+    this.offsetLimit = offsetLimit;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (offsetCount < offsetLimit && input.incrementToken()) {
+      int offsetLength = offsetAttrib.endOffset() - offsetAttrib.startOffset();
+      offsetCount += offsetLength;
+      return true;
+    }
+    return false;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    offsetCount = 0;
+  }
+
+}
\ No newline at end of file
Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
===================================================================
--- lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java	(revision 1076920)
+++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java	(working copy)
@@ -1274,7 +1274,7 @@
       return 1;
     }
 
-    public TokenStream init(TokenStream tokenStream) {
+    public TokenStream init(TokenStream tokenStream, int maxDocCharsToAnalyze) {
       return null;
     }
   });