Index: lucene/contrib/highlighter/src/java/overview.html =================================================================== --- lucene/contrib/highlighter/src/java/overview.html (revision 956773) +++ lucene/contrib/highlighter/src/java/overview.html (working copy) @@ -1,27 +0,0 @@ - - - - - Highlighter - - - - The highlight package contains classes to provide "keyword in context" features - typically used to highlight search terms in the text of results pages. - - \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java (working copy) @@ -1,162 +0,0 @@ -package org.apache.lucene.search.highlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.util.HashMap; -import java.util.HashSet; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.Query; - -/** - * {@link Scorer} implementation which scores text fragments by the number of - * unique query terms found. This class uses the {@link QueryTermExtractor} - * class to process determine the query terms and their boosts to be used. - */ -// TODO: provide option to boost score of fragments near beginning of document -// based on fragment.getFragNum() -public class QueryTermScorer implements Scorer { - - TextFragment currentTextFragment = null; - HashSet uniqueTermsInFragment; - - float totalScore = 0; - float maxTermWeight = 0; - private HashMap termsToFind; - - private CharTermAttribute termAtt; - - /** - * - * @param query a Lucene query (ideally rewritten using query.rewrite before - * being passed to this class and the searcher) - */ - public QueryTermScorer(Query query) { - this(QueryTermExtractor.getTerms(query)); - } - - /** - * - * @param query a Lucene query (ideally rewritten using query.rewrite before - * being passed to this class and the searcher) - * @param fieldName the Field name which is used to match Query terms - */ - public QueryTermScorer(Query query, String fieldName) { - this(QueryTermExtractor.getTerms(query, false, fieldName)); - } - - /** - * - * @param query a Lucene query (ideally rewritten using query.rewrite before - * being passed to this class and the searcher) - * @param reader used to compute IDF which can be used to a) score selected - * fragments better b) use graded highlights eg set font color - * intensity - * @param fieldName the field on which Inverse Document Frequency (IDF) - * calculations are based - */ - public QueryTermScorer(Query query, IndexReader reader, String fieldName) { - this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName)); - 
} - - public QueryTermScorer(WeightedTerm[] weightedTerms) { - termsToFind = new HashMap(); - for (int i = 0; i < weightedTerms.length; i++) { - WeightedTerm existingTerm = termsToFind - .get(weightedTerms[i].term); - if ((existingTerm == null) - || (existingTerm.weight < weightedTerms[i].weight)) { - // if a term is defined more than once, always use the highest scoring - // weight - termsToFind.put(weightedTerms[i].term, weightedTerms[i]); - maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight()); - } - } - } - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream) - */ - public TokenStream init(TokenStream tokenStream) { - termAtt = tokenStream.addAttribute(CharTermAttribute.class); - return null; - } - - /* - * (non-Javadoc) - * - * @see - * org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache - * .lucene.search.highlight.TextFragment) - */ - public void startFragment(TextFragment newFragment) { - uniqueTermsInFragment = new HashSet(); - currentTextFragment = newFragment; - totalScore = 0; - - } - - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Scorer#getTokenScore() - */ - public float getTokenScore() { - String termText = termAtt.toString(); - - WeightedTerm queryTerm = termsToFind.get(termText); - if (queryTerm == null) { - // not a query term - return - return 0; - } - // found a query term - is it unique in this doc? 
- if (!uniqueTermsInFragment.contains(termText)) { - totalScore += queryTerm.getWeight(); - uniqueTermsInFragment.add(termText); - } - return queryTerm.getWeight(); - } - - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore() - */ - public float getFragmentScore() { - return totalScore; - } - - /* - * (non-Javadoc) - * - * @see - * org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed() - */ - public void allFragmentsProcessed() { - // this class has no special operations to perform at end of processing - } - - /** - * - * @return The highest weighted term (useful for passing to GradientFormatter - * to set top end of coloring scale. - */ - public float getMaxTermWeight() { - return maxTermWeight; - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (working copy) @@ -1,108 +0,0 @@ -package org.apache.lucene.search.highlight; - - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -import java.util.List; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.search.spans.Spans; - - -/** - * {@link Fragmenter} implementation which breaks text up into same-size - * fragments but does not split up {@link Spans}. This is a simple sample class. - */ -public class SimpleSpanFragmenter implements Fragmenter { - private static final int DEFAULT_FRAGMENT_SIZE = 100; - private int fragmentSize; - private int currentNumFrags; - private int position = -1; - private QueryScorer queryScorer; - private int waitForPos = -1; - private int textSize; - private CharTermAttribute termAtt; - private PositionIncrementAttribute posIncAtt; - private OffsetAttribute offsetAtt; - - /** - * @param queryScorer QueryScorer that was used to score hits - */ - public SimpleSpanFragmenter(QueryScorer queryScorer) { - this(queryScorer, DEFAULT_FRAGMENT_SIZE); - } - - /** - * @param queryScorer QueryScorer that was used to score hits - * @param fragmentSize size in bytes of each fragment - */ - public SimpleSpanFragmenter(QueryScorer queryScorer, int fragmentSize) { - this.fragmentSize = fragmentSize; - this.queryScorer = queryScorer; - } - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment() - */ - public boolean isNewFragment() { - position += posIncAtt.getPositionIncrement(); - - if (waitForPos == position) { - waitForPos = -1; - } else if (waitForPos != -1) { - return false; - } - - WeightedSpanTerm wSpanTerm = queryScorer.getWeightedSpanTerm(termAtt.toString()); - - if (wSpanTerm != null) { - List positionSpans = wSpanTerm.getPositionSpans(); - - for (int i = 0; i < positionSpans.size(); i++) 
{ - if (positionSpans.get(i).start == position) { - waitForPos = positionSpans.get(i).end + 1; - break; - } - } - } - - boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags) - && (textSize - offsetAtt.endOffset()) >= (fragmentSize >>> 1); - - if (isNewFrag) { - currentNumFrags++; - } - - return isNewFrag; - } - - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream) - */ - public void start(String originalText, TokenStream tokenStream) { - position = -1; - currentNumFrags = 1; - textSize = originalText.length(); - termAtt = tokenStream.addAttribute(CharTermAttribute.class); - posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class); - offsetAtt = tokenStream.addAttribute(OffsetAttribute.class); - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java (working copy) @@ -1,33 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; - -/** - * {@link Fragmenter} implementation which does not fragment the text. - * This is useful for highlighting the entire content of a document or field. - */ -public class NullFragmenter implements Fragmenter { - public void start(String s, TokenStream tokenStream) { - } - - public boolean isNewFragment() { - return false; - } - -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java (working copy) @@ -1,59 +0,0 @@ -package org.apache.lucene.search.highlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Simple {@link Formatter} implementation to highlight terms with a pre and - * post tag. 
- */ -public class SimpleHTMLFormatter implements Formatter { - - private static final String DEFAULT_PRE_TAG = ""; - private static final String DEFAULT_POST_TAG = ""; - - private String preTag; - private String postTag; - - public SimpleHTMLFormatter(String preTag, String postTag) { - this.preTag = preTag; - this.postTag = postTag; - } - - /** Default constructor uses HTML: <B> tags to markup terms. */ - public SimpleHTMLFormatter() { - this(DEFAULT_PRE_TAG, DEFAULT_POST_TAG); - } - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Formatter#highlightTerm(java.lang.String, org.apache.lucene.search.highlight.TokenGroup) - */ - public String highlightTerm(String originalText, TokenGroup tokenGroup) { - if (tokenGroup.getTotalScore() <= 0) { - return originalText; - } - - // Allocate StringBuilder with the right number of characters from the - // beginning, to avoid char[] allocations in the middle of appends. - StringBuilder returnBuffer = new StringBuilder(preTag.length() + originalText.length() + postTag.length()); - returnBuffer.append(preTag); - returnBuffer.append(originalText); - returnBuffer.append(postTag); - return returnBuffer.toString(); - } - -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/formatting/Formatter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/formatting/Formatter.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/formatting/Formatter.java (revision 0) @@ -0,0 +1,30 @@ +package org.apache.lucene.search.highlight.formatting; + +/** + * Interface that defines a class that is used to format + * highlighted text. In order for implementors of this + * interface to be threadsafe, {@link #highlight(String)} + * must not modify the state of the implementing class. 
+ * @author Edward Drapkin + */ +public interface Formatter { + /** + * Highlight the specified CharSequence, a token to be highlighted. + * e.g. In the sentence "I searched for this", if we were + * highlighting the word "searched", a String with value "searched" + * would be given to this method. + * + * @param toHighlight the String to highlight. + * @return the highlighted String. + */ + public String highlight(String toHighlight); + + /** + * Returns the tag that will be prepended to highlighted words. + */ + public String getPreTag(); + /** + * Returns the tag that will be appended to highlighted words. + */ + public String getPostTag(); +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/formatting/HTMLTagFormatter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/formatting/HTMLTagFormatter.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/formatting/HTMLTagFormatter.java (revision 0) @@ -0,0 +1,75 @@ +package org.apache.lucene.search.highlight.formatting; + +/** + * {@link Formatter} that highlights using the supplied HTML tags. + * @author Edward Drapkin + * + */ +public class HTMLTagFormatter implements Formatter { + private final String startTag; + private final String endTag; + + /** + * Creates a new HTMLTagFormatter using startTag and + * endTag. startTag will be prepended + * and endTag will be appended to highlighted + * phrases. 
+ * @param startTag + * @param endTag + */ + public HTMLTagFormatter(String startTag, String endTag) { + this.startTag = startTag; + this.endTag = endTag; + } + + //inherit javadoc + public String highlight(String toHighlight) { + StringBuilder builder = new StringBuilder(this.startTag.length() + this.endTag.length() + toHighlight.length()); + builder.append(this.startTag); + builder.append(htmlEscape(toHighlight)); + builder.append(this.endTag); + return builder.toString(); + } + + private final static CharSequence htmlEscape(String text) { + if(text == null || text.length() == 0) { + return new StringBuilder(); + } + + StringBuilder builder = new StringBuilder(text.length()); + + for(int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + switch(c) { + case '"': + builder.append("""); + break; + case '&': + builder.append("&"); + break; + case '<': + builder.append("<"); + break; + case '>': + builder.append(">"); + break; + default: + if(c > 128) { + builder.append("&#").append((int)c).append(";"); + } else { + builder.append(c); + } + } + } + + return builder; + } + + public String getPreTag() { + return startTag; + } + + public String getPostTag() { + return endTag; + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/formatting/NullFormatter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/formatting/NullFormatter.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/formatting/NullFormatter.java (revision 0) @@ -0,0 +1,20 @@ +package org.apache.lucene.search.highlight.formatting; + +/** + * Simple Formatter that returns the highlighted segments unaltered. 
+ */ +public class NullFormatter implements Formatter { + + public String highlight(String toHighlight) { + return toHighlight; + } + + public String getPreTag() { + return ""; + } + + public String getPostTag() { + return ""; + } + +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/IterativeHighlighter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/IterativeHighlighter.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/IterativeHighlighter.java (revision 0) @@ -0,0 +1,129 @@ +package org.apache.lucene.search.highlight.iterative; + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.search.highlight.formatting.Formatter; +import org.apache.lucene.search.highlight.formatting.HTMLTagFormatter; +import org.apache.lucene.search.highlight.iterative.scoring.Scorer; + +public class IterativeHighlighter { + public static final int DEFAULT_NUM_SEGMENTS = 3; + public static final int DEFAULT_SEGMENT_LENGTH = 200; + + private final Processor processor; + + public IterativeHighlighter(Scorer scorer) { + this(scorer, new HTMLTagFormatter("", "")); + } + + public IterativeHighlighter(Scorer scorer, Formatter formatter) { + this.processor = new Processor(scorer.getTermScores(), formatter); + } + + /** + * @deprecated the phrase "fragment" has become "segment." + */ + public String getBestFragment(Analyzer analyzer, String fieldName, String text) throws IOException { + TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); + return this.getBestFragment(tokenStream, text); + } + + /** + * @deprecated the phrase "fragment" has become "segment." 
+ */ + public String getBestFragment(TokenStream tokenStream, String text) throws IOException { + return this.getBestFragment(tokenStream, text, DEFAULT_SEGMENT_LENGTH); + } + + /** + * @deprecated the phrase "fragment" has become "segment." + */ + public String getBestFragment(TokenStream tokenStream, String text, int segmentLength) throws IOException { + return processor.highlightSegments(tokenStream, text, segmentLength, 1)[0]; + } + + /** + * @deprecated the phrase "fragment" has become "segment." + */ + public String[] getBestFragments(Analyzer analyzer, String fieldName, String text) throws IOException { + return this.getBestFragments(analyzer, fieldName, text, DEFAULT_NUM_SEGMENTS, DEFAULT_SEGMENT_LENGTH); + } + + /** + * @deprecated the phrase "fragment" has become "segment." + */ + public String[] getBestFragments(Analyzer analyzer, String fieldName, String text, int numSegments) throws IOException { + return this.getBestFragments(analyzer, fieldName, text, numSegments, DEFAULT_SEGMENT_LENGTH); + } + + /** + * @deprecated the phrase "fragment" has become "segment." + */ + public String[] getBestFragments(Analyzer analyzer, String fieldName, String text, int numSegments, int segmentLength) throws IOException { + TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); + return this.getBestFragments(tokenStream, text, numSegments, segmentLength); + } + + /** + * @deprecated the phrase "fragment" has become "segment." 
+ */ + public String[] getBestFragments(TokenStream tokenStream, String text, int numSegments, int segmentLength) throws IOException { + return processor.highlightSegments(tokenStream, text, segmentLength, numSegments); + } + + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.iterative.Highlighter#getBestSegment(org.apache.lucene.analysis.Analyzer, java.lang.String, java.lang.String) + */ + public String getBestSegment(Analyzer analyzer, String fieldName, String text) throws IOException { + TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); + return this.getBestSegment(tokenStream, text); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.iterative.Highlighter#getBestSegment(org.apache.lucene.analysis.TokenStream, java.lang.String) + */ + public String getBestSegment(TokenStream tokenStream, String text) throws IOException { + return this.getBestSegment(tokenStream, text, DEFAULT_SEGMENT_LENGTH); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.iterative.Highlighter#getBestSegment(org.apache.lucene.analysis.TokenStream, java.lang.String, int) + */ + public String getBestSegment(TokenStream tokenStream, String text, int segmentLength) throws IOException { + return processor.highlightSegments(tokenStream, text, segmentLength, 1)[0]; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.iterative.Highlighter#getBestSegments(org.apache.lucene.analysis.Analyzer, java.lang.String, java.lang.String) + */ + public String[] getBestSegments(Analyzer analyzer, String fieldName, String text) throws IOException { + return this.getBestSegments(analyzer, fieldName, text, DEFAULT_NUM_SEGMENTS, DEFAULT_SEGMENT_LENGTH); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.iterative.Highlighter#getBestSegments(org.apache.lucene.analysis.Analyzer, java.lang.String, java.lang.String, int) + */ + public String[] getBestSegments(Analyzer analyzer, String fieldName, String 
text, int numSegments) throws IOException { + return this.getBestSegments(analyzer, fieldName, text, numSegments, DEFAULT_SEGMENT_LENGTH); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.iterative.Highlighter#getBestSegments(org.apache.lucene.analysis.Analyzer, java.lang.String, java.lang.String, int, int) + */ + public String[] getBestSegments(Analyzer analyzer, String fieldName, String text, int numSegments, int segmentLength) throws IOException { + TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); + return this.getBestSegments(tokenStream, text, numSegments, segmentLength); + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.iterative.Highlighter#getBestSegments(org.apache.lucene.analysis.TokenStream, java.lang.String, int, int) + */ + public String[] getBestSegments(TokenStream tokenStream, String text, int numSegments, int segmentLength) throws IOException { + return processor.highlightSegments(tokenStream, text, segmentLength, numSegments); + } + +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/Processor.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/Processor.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/Processor.java (revision 0) @@ -0,0 +1,252 @@ +package org.apache.lucene.search.highlight.iterative; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import 
org.apache.lucene.search.highlight.formatting.Formatter; + +/** + * Processor encapsulates the logic used to score a {@link TokenStream}. + * + * @author Edward Drapkin + */ +class Processor { + private final Map scores; + private final Formatter formatter; + /** + * Constructs a new Processor + * + * @param scores + * A Map where keys are the terms to score and the + * values are the scores for their respective term. Used for scoring + * highlighted text. + */ + public Processor(final Map scores, final Formatter formatter) { + this.scores = scores; + this.formatter = formatter; + } + + /** + * Highlights segments of text from the tokenStream. Returns the best + * numSegments segments from originalText. Best, in this context, + * means the segment with the highest score where the score is calculated by + * multiplying each term with its given score and then adding the resultant + * scores together. + * + * The length of the segment is an "almost" comparison. The length will be to + * the end of the next complete word that begins at offset + * segmentLength. + * + * @param tokenStream + * the {@link TokenStream} from the original text. + * @param originalText + * the text to highlight. + * @param segmentLength + * the length of the segments to be returned. + * @param numSegments + * the number of segments to return + * @param formatter + * the {@link Formatter} used to highlight terms. + * @return a String array with the best highlighted segments. 
+ * @throws IOException + */ + String[] highlightSegments(final TokenStream tokenStream, + final String originalText, final int segmentLength, + final int numSegments) throws IOException { + final OffsetMetaData metaData = this.scoreSegments(tokenStream, + segmentLength); + + final List segments = new ArrayList(numSegments); + + final Iterator offsetIter = metaData + .getSegmentOffsetsAndScores().iterator(); + + int prevOffset = 0; + for (int i = 0; (i < numSegments) && offsetIter.hasNext(); i++) { + final int nextOffset = offsetIter.next().offset; + + if(prevOffset + segmentLength > nextOffset) { //the segments overlap + continue; + } + + final int end = metaData.getOffsetsAndLengths() + .headSet(new OffsetAndLength(nextOffset + segmentLength, 0)).last().offset; + final StringBuilder seg = new StringBuilder(originalText.substring( + nextOffset, end)); + + final SortedSet toHighlight = metaData + .getOffsetsAndLengths() + .tailSet(new OffsetAndLength(nextOffset + segmentLength - 1, 0)) + .headSet(new OffsetAndLength(nextOffset - 1, 0)); + + for (final OffsetAndLength entry : toHighlight) { + final int offset = entry.offset - nextOffset; + final int length = entry.length; + final String highlightTerm = formatter.highlight(seg.substring(offset, + offset + length)); + seg.replace(offset, offset + length, highlightTerm); + + } + segments.add(seg.toString()); + prevOffset = nextOffset; + } + + return segments.toArray(new String[segments.size()]); + } + + /** + * Actually scores the {@link TokenStream}. + * @param tokenStream The TokenStream to score. + * @param segmentLength The length of the segments, in characters. + * Note that this is a rough estimation as the actual results + * may be slightly more or less, as trailing words may not fall + * exactly on the limit, so the returned segments will + * be truncated or expanded so that they are a complete + * set of tokens. 
+ * @return An {@link OffsetMetaData} instance representing offsets + * and lengths, as well as scores. + * @throws IOException + */ + private OffsetMetaData scoreSegments(final TokenStream tokenStream, + final int segmentLength) throws IOException { + final List offsetWithScores = new ArrayList(); + + final SortedSet highlightOffsetsWithLength = new TreeSet( + Collections.reverseOrder(new OffsetComparator())); + + final CharTermAttribute termAtt = tokenStream + .addAttribute(CharTermAttribute.class); + final OffsetAttribute offsetAtt = tokenStream + .addAttribute(OffsetAttribute.class); + + tokenStream.reset(); + + int trackPos = 0; + int trackOff = 0; + + // iterate through the token stream + while (tokenStream.incrementToken()) { + final int offset = offsetAtt.startOffset(); + while (offset - trackOff > segmentLength && offsetWithScores.size() > trackPos) { + trackOff = offsetWithScores.get(trackPos).offset; + trackPos++; + } + + final String term = termAtt.toString(); + final Integer score = this.scores.get(term); + if (score != null) { + offsetWithScores.add(new OffsetAndScore(offsetAtt.startOffset(), score + .intValue())); + + highlightOffsetsWithLength.add(new OffsetAndLength(offsetAtt + .startOffset(), termAtt.length())); + + for (int i = trackPos; i < offsetWithScores.size(); i++) { + offsetWithScores.get(i).incScore(score.intValue()); + } + } else { + // not a score term, add to the score table with an initial + // score of 0 + offsetWithScores.add(new OffsetAndScore(offset, 0)); + } + + } + + final SortedSet scoredSegments = new TreeSet( + new Comparator() { + public int compare(final OffsetAndScore o1, final OffsetAndScore o2) { + if (o1.offset != o2.offset) { + return o2.getScore() - o1.getScore(); + } else { + return o1.offset - o2.offset; + } + } + + }); + scoredSegments.addAll(offsetWithScores); + return new OffsetMetaData(highlightOffsetsWithLength, scoredSegments); + } + + /** + * Class that wraps the requisite information needed to highlight 
text. + * Keeps track of the offsets and lengths for all "hits" as well as + * keeping track of segment scores. + */ + private class OffsetMetaData { + private final SortedSet offsetsAndLengths; + private final SortedSet segmentOffsetsAndScores; + + public OffsetMetaData(final SortedSet offsetsAndLengths, + final SortedSet segmentOffsetsAndScores) { + this.offsetsAndLengths = offsetsAndLengths; + this.segmentOffsetsAndScores = segmentOffsetsAndScores; + } + + public SortedSet getOffsetsAndLengths() { + return this.offsetsAndLengths; + } + + public SortedSet getSegmentOffsetsAndScores() { + return this.segmentOffsetsAndScores; + } + + } + + /* + * The following classes are a little... strange, however they encapsulate one + * or more primitive values and I had tested that using classes like this in a + * sorted set performs faster than using boxed primitive classes in a map. + */ + private static abstract class Offset { + protected final int offset; + + protected Offset(final int offset) { + this.offset = offset; + } + } + + private static final class OffsetAndScore extends Offset { + private int score; + + public OffsetAndScore(final int offset, final int score) { + super(offset); + this.score = score; + } + + public void incScore(final int toInc) { + this.score += toInc; + } + + public int getScore() { + return this.score; + } + + } + + private static final class OffsetAndLength extends Offset { + private final int length; + + public OffsetAndLength(final int offset, final int length) { + super(offset); + this.length = length; + } + } + + private static final class OffsetComparator implements + Comparator { + public int compare(final T o1, final T o2) { + return o1.offset - o2.offset; + } + } + +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/scoring/QueryScorer.java =================================================================== --- 
lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/scoring/QueryScorer.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/scoring/QueryScorer.java (revision 0) @@ -0,0 +1,142 @@ +package org.apache.lucene.search.highlight.iterative.scoring; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FilteredQuery; +import org.apache.lucene.search.Query; + +/** + * Class that "consumes" a query and extracts a term weight map. + * + * @author Edward Drapkin + */ +public class QueryScorer implements Scorer { + private final Query query; + private final boolean includeNotValues; + private final boolean mergeWeights; + private final Map terms = new HashMap(); + + /** + * Create a QueryConsumer with the given query and the default values for + * including "NOT" query parameters and merging weights (false, false); + * + * @param query + * the {@link Query} to extract terms from. + */ + public QueryScorer(final Query query) { + this(query, false, false); + } + + /** + * Create a QueryConsumer with the default value for merging weights (false). + * + * @param query + * the {@link Query} to extract terms from. + * @param includeNotValues + * whether to include terms for query parameters specified as "NOT" + */ + public QueryScorer(final Query query, final boolean includeNotValues) { + this(query, includeNotValues, false); + } + + /** + * Creates a QueryConsumer object. + * + * @param query + * the {@link Query} to extract terms from. 
+ * @param includeNotValues + * whether to include terms for query parameters specified as "NOT" + * @param mergeWeights + * whether to merge weights of terms; when a term occurs more than + * once in a query, if this is false, the maximum value is used, + * whereas if this is true, the results are added together. + */ + public QueryScorer(final Query query, final boolean includeNotValues, + final boolean mergeWeights) { + this.query = query; + this.includeNotValues = includeNotValues; + this.mergeWeights = mergeWeights; + this.extractQueryTerms(); + } + + private void extractQueryTerms() { + final List queries = new ArrayList(); + queries.add(this.query); + int index = 0; + boolean hasMore = true; + + while (hasMore) { + final Query q = queries.get(index); + + index++; + + if (q instanceof BooleanQuery) { + for (final BooleanClause clause : ((BooleanQuery) q).getClauses()) { + if (this.includeNotValues + || (clause.getOccur() != BooleanClause.Occur.MUST_NOT)) { + queries.add(clause.getQuery()); + } + } + } else if (q instanceof FilteredQuery) { + queries.add(((FilteredQuery) q).getQuery()); + } else { + final Set terms = new HashSet(); + q.extractTerms(terms); + + for (final Term term : terms) { + final Integer termWeight = this.terms.get(term); + if (termWeight == null) { + this.terms.put(term.text(), Math.round(q.getBoost())); + } else if (this.mergeWeights) { + this.terms.put(term.text(), Math.round(q.getBoost()) + termWeight); + } else if (termWeight < q.getBoost()) { + this.terms.put(term.text(), Math.round(q.getBoost())); + } + } + } + + if (index >= queries.size() - 1) { + hasMore = false; + } + } + } + + /** + * @return the query that was used to extract {@link Term}s and weights. + */ + public Query getQuery() { + return this.query; + } + + /** + * @return whether this QueryConsumer is including "NOT" query parameters. 
+ */ + public boolean isIncludeNotValues() { + return this.includeNotValues; + } + + /** + * @return an unmodifiable view of the term weight map with {@link String} keys + * and {@link Integer} values. + */ + public Map getTermScores() { + return Collections.unmodifiableMap(this.terms); + } + + /** + * @return whether this QueryScorer is merging the weights of terms that + * occur multiple times within the Query. + */ + public boolean isMergeWeights() { + return this.mergeWeights; + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/scoring/Scorer.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/scoring/Scorer.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/iterative/scoring/Scorer.java (revision 0) @@ -0,0 +1,17 @@ +package org.apache.lucene.search.highlight.iterative.scoring; + +import java.util.Map; + +/** + * Interface that defines a class used to score words. Only words with + * a score should be present in the map, e.g. there should be no entries + * with a score of 0. + * @author Edward Drapkin + * + */ +public interface Scorer { + /** + * Return the map of Word->Score. + */ + public Map getTermScores(); +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy) @@ -1,538 +1,51 @@ package org.apache.lucene.search.highlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.Iterator; -import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.util.PriorityQueue; - -/** - * Class used to markup highlighted terms found in the best sections of a - * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}, - * {@link Encoder} and tokenizers. 
- */ -public class Highlighter -{ - public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024; - - private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE; - private Formatter formatter; - private Encoder encoder; - private Fragmenter textFragmenter=new SimpleFragmenter(); - private Scorer fragmentScorer=null; - - public Highlighter(Scorer fragmentScorer) - { - this(new SimpleHTMLFormatter(),fragmentScorer); - } - - - public Highlighter(Formatter formatter, Scorer fragmentScorer) - { - this(formatter,new DefaultEncoder(),fragmentScorer); - } - - - public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer) - { - this.formatter = formatter; - this.encoder = encoder; - this.fragmentScorer = fragmentScorer; - } - - /** - * Highlights chosen terms in a text, extracting the most relevant section. - * This is a convenience method that calls - * {@link #getBestFragment(TokenStream, String)} - * - * @param analyzer the analyzer that will be used to split text - * into chunks - * @param text text to highlight terms in - * @param fieldName Name of field used to influence analyzer's tokenization policy - * - * @return highlighted text fragment or null if no terms found - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final String getBestFragment(Analyzer analyzer, String fieldName,String text) - throws IOException, InvalidTokenOffsetsException - { - TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); - return getBestFragment(tokenStream, text); - } - - /** - * Highlights chosen terms in a text, extracting the most relevant section. - * The document text is analysed in chunks to record hit statistics - * across the document. After accumulating stats, the fragment with the highest score - * is returned - * - * @param tokenStream a stream of tokens identified in the text parameter, including offset information. 
- * This is typically produced by an analyzer re-parsing a document's - * text. Some work may be done on retrieving TokenStreams more efficiently - * by adding support for storing original text position data in the Lucene - * index but this support is not currently available (as of Lucene 1.4 rc2). - * @param text text to highlight terms in - * - * @return highlighted text fragment or null if no terms found - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final String getBestFragment(TokenStream tokenStream, String text) - throws IOException, InvalidTokenOffsetsException - { - String[] results = getBestFragments(tokenStream,text, 1); - if (results.length > 0) - { - return results[0]; - } - return null; - } - - /** - * Highlights chosen terms in a text, extracting the most relevant sections. - * This is a convenience method that calls - * {@link #getBestFragments(TokenStream, String, int)} - * - * @param analyzer the analyzer that will be used to split text - * into chunks - * @param fieldName the name of the field being highlighted (used by analyzer) - * @param text text to highlight terms in - * @param maxNumFragments the maximum number of fragments. - * - * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final String[] getBestFragments( - Analyzer analyzer, - String fieldName, - String text, - int maxNumFragments) - throws IOException, InvalidTokenOffsetsException - { - TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); - return getBestFragments(tokenStream, text, maxNumFragments); - } - - /** - * Highlights chosen terms in a text, extracting the most relevant sections. - * The document text is analysed in chunks to record hit statistics - * across the document. 
After accumulating stats, the fragments with the highest scores - * are returned as an array of strings in order of score (contiguous fragments are merged into - * one in their original order to improve readability) - * - * @param text text to highlight terms in - * @param maxNumFragments the maximum number of fragments. - * - * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final String[] getBestFragments( - TokenStream tokenStream, - String text, - int maxNumFragments) - throws IOException, InvalidTokenOffsetsException - { - maxNumFragments = Math.max(1, maxNumFragments); //sanity check - - TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments); - - //Get text - ArrayList fragTexts = new ArrayList(); - for (int i = 0; i < frag.length; i++) - { - if ((frag[i] != null) && (frag[i].getScore() > 0)) - { - fragTexts.add(frag[i].toString()); - } - } - return fragTexts.toArray(new String[0]); - } - - - /** - * Low level api to get the most relevant (formatted) sections of the document. - * This method has been made public to allow visibility of score information held in TextFragment objects. - * Thanks to Jason Calabrese for help in redefining the interface. 
- * @param tokenStream - * @param text - * @param maxNumFragments - * @param mergeContiguousFragments - * @throws IOException - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final TextFragment[] getBestTextFragments( - TokenStream tokenStream, - String text, - boolean mergeContiguousFragments, - int maxNumFragments) - throws IOException, InvalidTokenOffsetsException - { - ArrayList docFrags = new ArrayList(); - StringBuilder newText=new StringBuilder(); - - CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class); - OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class); - tokenStream.addAttribute(PositionIncrementAttribute.class); - tokenStream.reset(); - - TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); - TokenStream newStream = fragmentScorer.init(tokenStream); - if(newStream != null) { - tokenStream = newStream; - } - fragmentScorer.startFragment(currentFrag); - docFrags.add(currentFrag); - - FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); - - try - { - - String tokenText; - int startOffset; - int endOffset; - int lastEndOffset = 0; - textFragmenter.start(text, tokenStream); - - TokenGroup tokenGroup=new TokenGroup(tokenStream); - - for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze); - next = tokenStream.incrementToken()) - { - if( (offsetAtt.endOffset()>text.length()) - || - (offsetAtt.startOffset()>text.length()) - ) - { - throw new InvalidTokenOffsetsException("Token "+ termAtt.toString() - +" exceeds length of provided text sized "+text.length()); - } - if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct())) - { - //the current token is distinct from previous tokens - - // markup the cached token group info - startOffset = tokenGroup.matchStartOffset; - endOffset = tokenGroup.matchEndOffset; - tokenText = text.substring(startOffset, 
endOffset); - String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); - //store any whitespace etc from between this and last group - if (startOffset > lastEndOffset) - newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); - newText.append(markedUpText); - lastEndOffset=Math.max(endOffset, lastEndOffset); - tokenGroup.clear(); - - //check if current token marks the start of a new fragment - if(textFragmenter.isNewFragment()) - { - currentFrag.setScore(fragmentScorer.getFragmentScore()); - //record stats for a new fragment - currentFrag.textEndPos = newText.length(); - currentFrag =new TextFragment(newText, newText.length(), docFrags.size()); - fragmentScorer.startFragment(currentFrag); - docFrags.add(currentFrag); - } - } - - tokenGroup.addToken(fragmentScorer.getTokenScore()); - -// if(lastEndOffset>maxDocBytesToAnalyze) -// { -// break; -// } - } - currentFrag.setScore(fragmentScorer.getFragmentScore()); - - if(tokenGroup.numTokens>0) - { - //flush the accumulated text (same code as in above loop) - startOffset = tokenGroup.matchStartOffset; - endOffset = tokenGroup.matchEndOffset; - tokenText = text.substring(startOffset, endOffset); - String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); - //store any whitespace etc from between this and last group - if (startOffset > lastEndOffset) - newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); - newText.append(markedUpText); - lastEndOffset=Math.max(lastEndOffset,endOffset); - } - - //Test what remains of the original text beyond the point where we stopped analyzing - if ( -// if there is text beyond the last token considered.. - (lastEndOffset < text.length()) - && -// and that text is not too large... 
- (text.length()<= maxDocCharsToAnalyze) - ) - { - //append it to the last fragment - newText.append(encoder.encodeText(text.substring(lastEndOffset))); - } - - currentFrag.textEndPos = newText.length(); - - //sort the most relevant sections of the text - for (Iterator i = docFrags.iterator(); i.hasNext();) - { - currentFrag = i.next(); - - //If you are running with a version of Lucene before 11th Sept 03 - // you do not have PriorityQueue.insert() - so uncomment the code below - /* - if (currentFrag.getScore() >= minScore) - { - fragQueue.put(currentFrag); - if (fragQueue.size() > maxNumFragments) - { // if hit queue overfull - fragQueue.pop(); // remove lowest in hit queue - minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore - } +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.MapFieldSelector; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.highlight.formatting.Formatter; +import org.apache.lucene.search.highlight.iterative.IterativeHighlighter; +import org.apache.lucene.search.highlight.iterative.scoring.QueryScorer; +import org.apache.lucene.search.highlight.termvector.FastVectorHighlighter; +import org.apache.lucene.search.highlight.termvector.FieldQuery; - - } - */ - //The above code caused a problem as a result of Christoph Goller's 11th Sept 03 - //fix to PriorityQueue. The correct method to use here is the new "insert" method - // USE ABOVE CODE IF THIS DOES NOT COMPILE! 
- fragQueue.insertWithOverflow(currentFrag); - } - - //return the most relevant fragments - TextFragment frag[] = new TextFragment[fragQueue.size()]; - for (int i = frag.length - 1; i >= 0; i--) - { - frag[i] = fragQueue.pop(); - } - - //merge any contiguous fragments to improve readability - if(mergeContiguousFragments) - { - mergeContiguousFragments(frag); - ArrayList fragTexts = new ArrayList(); - for (int i = 0; i < frag.length; i++) - { - if ((frag[i] != null) && (frag[i].getScore() > 0)) - { - fragTexts.add(frag[i]); - } - } - frag= fragTexts.toArray(new TextFragment[0]); - } - - return frag; - - } - finally - { - if (tokenStream != null) - { - try - { - tokenStream.close(); - } - catch (Exception e) - { - } - } - } - } - - - /** Improves readability of a score-sorted list of TextFragments by merging any fragments - * that were contiguous in the original text into one larger fragment with the correct order. - * This will leave a "null" in the array entry for the lesser scored fragment. - * - * @param frag An array of document fragments in descending score - */ - private void mergeContiguousFragments(TextFragment[] frag) - { - boolean mergingStillBeingDone; - if (frag.length > 1) - do - { - mergingStillBeingDone = false; //initialise loop control flag - //for each fragment, scan other frags looking for contiguous blocks - for (int i = 0; i < frag.length; i++) - { - if (frag[i] == null) - { - continue; - } - //merge any contiguous blocks - for (int x = 0; x < frag.length; x++) - { - if (frag[x] == null) - { - continue; - } - if (frag[i] == null) - { - break; - } - TextFragment frag1 = null; - TextFragment frag2 = null; - int frag1Num = 0; - int frag2Num = 0; - int bestScoringFragNum; - int worstScoringFragNum; - //if blocks are contiguous.... 
- if (frag[i].follows(frag[x])) - { - frag1 = frag[x]; - frag1Num = x; - frag2 = frag[i]; - frag2Num = i; - } - else - if (frag[x].follows(frag[i])) - { - frag1 = frag[i]; - frag1Num = i; - frag2 = frag[x]; - frag2Num = x; - } - //merging required.. - if (frag1 != null) - { - if (frag1.getScore() > frag2.getScore()) - { - bestScoringFragNum = frag1Num; - worstScoringFragNum = frag2Num; - } - else - { - bestScoringFragNum = frag2Num; - worstScoringFragNum = frag1Num; - } - frag1.merge(frag2); - frag[worstScoringFragNum] = null; - mergingStillBeingDone = true; - frag[bestScoringFragNum] = frag1; - } - } - } - } - while (mergingStillBeingDone); - } - - - /** - * Highlights terms in the text , extracting the most relevant sections - * and concatenating the chosen fragments with a separator (typically "..."). - * The document text is analysed in chunks to record hit statistics - * across the document. After accumulating stats, the fragments with the highest scores - * are returned in order as "separator" delimited strings. - * - * @param text text to highlight terms in - * @param maxNumFragments the maximum number of fragments. 
- * @param separator the separator used to intersperse the document fragments (typically "...") - * - * @return highlighted text - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final String getBestFragments( - TokenStream tokenStream, - String text, - int maxNumFragments, - String separator) - throws IOException, InvalidTokenOffsetsException - { - String sections[] = getBestFragments(tokenStream,text, maxNumFragments); - StringBuilder result = new StringBuilder(); - for (int i = 0; i < sections.length; i++) - { - if (i > 0) - { - result.append(separator); - } - result.append(sections[i]); - } - return result.toString(); - } - - public int getMaxDocCharsToAnalyze() { - return maxDocCharsToAnalyze; +public class Highlighter { + public static final int DEFAULT_NUM_SEGMENTS = 3; + public static final int DEFAULT_SEGMENT_LENGTH = 200; + + private final Query query; + private final Formatter formatter; + + public Highlighter(Query query, Formatter formatter) { + this.query = query; + this.formatter = formatter; } - - public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) { - this.maxDocCharsToAnalyze = maxDocCharsToAnalyze; + + public String[] vectorHighlight(IndexReader reader, String fieldName, int docId, int segmentLength, int numSegments) throws IOException { + FastVectorHighlighter fvh = new FastVectorHighlighter(); + FieldQuery fq = fvh.getFieldQuery(query); + return fvh.getBestFragments(fq, reader, docId, fieldName, segmentLength, numSegments); } - - public Fragmenter getTextFragmenter() - { - return textFragmenter; - } - - /** - * @param fragmenter - */ - public void setTextFragmenter(Fragmenter fragmenter) - { - textFragmenter = fragmenter; - } - - /** - * @return Object used to score each text fragment - */ - public Scorer getFragmentScorer() - { - return fragmentScorer; - } - - - /** - * @param scorer - */ - public void setFragmentScorer(Scorer scorer) - { - fragmentScorer = 
scorer; - } + public String[] highlight(IndexReader reader, String fieldName, int docId, int segmentLength, int numSegments) throws CorruptIndexException, IOException { + Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) ); + Fieldable[] fields = doc.getFields( fieldName ); - public Encoder getEncoder() - { - return encoder; + if(fields[0].isTermVectorStored()) { + return vectorHighlight(reader, fieldName, docId, segmentLength, numSegments); + } else { + return iterativeHighlight(fields[0].tokenStreamValue(), doc.get(fieldName), numSegments, segmentLength); } - public void setEncoder(Encoder encoder) - { - this.encoder = encoder; - } -} -class FragmentQueue extends PriorityQueue -{ - public FragmentQueue(int size) - { - initialize(size); - } - - @Override - public final boolean lessThan(TextFragment fragA, TextFragment fragB) - { - if (fragA.getScore() == fragB.getScore()) - return fragA.fragNum > fragB.fragNum; - else - return fragA.getScore() < fragB.getScore(); - } -} + } + + public String[] iterativeHighlight(TokenStream tokenStream, String text, int numSegments, int segmentLength) throws IOException { + IterativeHighlighter ih = new IterativeHighlighter(new QueryScorer(query), formatter); + return ih.getBestSegments(tokenStream, text, numSegments, segmentLength); + } +} \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (working copy) @@ -1,132 +0,0 @@ -package org.apache.lucene.search.highlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; - -/** - * One, or several overlapping tokens, along with the score(s) and the scope of - * the original text - */ -public class TokenGroup { - - private static final int MAX_NUM_TOKENS_PER_GROUP = 50; - Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP]; - float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP]; - int numTokens = 0; - int startOffset = 0; - int endOffset = 0; - float tot; - int matchStartOffset, matchEndOffset; - - private OffsetAttribute offsetAtt; - private CharTermAttribute termAtt; - - public TokenGroup(TokenStream tokenStream) { - offsetAtt = tokenStream.addAttribute(OffsetAttribute.class); - termAtt = tokenStream.addAttribute(CharTermAttribute.class); - } - - void addToken(float score) { - if (numTokens < MAX_NUM_TOKENS_PER_GROUP) { - int termStartOffset = offsetAtt.startOffset(); - int termEndOffset = offsetAtt.endOffset(); - if (numTokens == 0) { - startOffset = matchStartOffset = termStartOffset; - endOffset = matchEndOffset = termEndOffset; - tot += score; - } else { - startOffset = Math.min(startOffset, termStartOffset); - 
endOffset = Math.max(endOffset, termEndOffset); - if (score > 0) { - if (tot == 0) { - matchStartOffset = offsetAtt.startOffset(); - matchEndOffset = offsetAtt.endOffset(); - } else { - matchStartOffset = Math.min(matchStartOffset, termStartOffset); - matchEndOffset = Math.max(matchEndOffset, termEndOffset); - } - tot += score; - } - } - Token token = new Token(termStartOffset, termEndOffset); - token.setEmpty().append(termAtt); - tokens[numTokens] = token; - scores[numTokens] = score; - numTokens++; - } - } - - boolean isDistinct() { - return offsetAtt.startOffset() >= endOffset; - } - - void clear() { - numTokens = 0; - tot = 0; - } - - /* - * @param index a value between 0 and numTokens -1 - * @return the "n"th token - */ - public Token getToken(int index) - { - return tokens[index]; - } - - /** - * - * @param index a value between 0 and numTokens -1 - * @return the "n"th score - */ - public float getScore(int index) { - return scores[index]; - } - - /** - * @return the end position in the original text - */ - public int getEndOffset() { - return endOffset; - } - - /** - * @return the number of tokens in this group - */ - public int getNumTokens() { - return numTokens; - } - - /** - * @return the start position in the original text - */ - public int getStartOffset() { - return startOffset; - } - - /** - * @return all tokens' scores summed up - */ - public float getTotalScore() { - return tot; - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java (working copy) @@ -1,80 +0,0 @@ -package org.apache.lucene.search.highlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; - -/** - * {@link Fragmenter} implementation which breaks text up into same-size - * fragments with no concerns over spotting sentence boundaries. - */ -public class SimpleFragmenter implements Fragmenter { - private static final int DEFAULT_FRAGMENT_SIZE = 100; - private int currentNumFrags; - private int fragmentSize; - private OffsetAttribute offsetAtt; - - public SimpleFragmenter() { - this(DEFAULT_FRAGMENT_SIZE); - } - - /** - * - * @param fragmentSize size in number of characters of each fragment - */ - public SimpleFragmenter(int fragmentSize) { - this.fragmentSize = fragmentSize; - } - - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream) - */ - public void start(String originalText, TokenStream stream) { - offsetAtt = stream.addAttribute(OffsetAttribute.class); - currentNumFrags = 1; - } - - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment() - */ - public boolean isNewFragment() { - boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags); - if (isNewFrag) { - currentNumFrags++; - } - return isNewFrag; - } - - 
/** - * @return size in number of characters of each fragment - */ - public int getFragmentSize() { - return fragmentSize; - } - - /** - * @param size size in characters of each fragment - */ - public void setFragmentSize(int size) { - fragmentSize = size; - } - -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java (working copy) @@ -1,33 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/** - * Processes terms found in the original text, typically by applying some form - * of mark-up to highlight terms in HTML search results pages. - * - */ -public interface Formatter -{ - /** - * @param originalText The section of text being considered for markup - * @param tokenGroup contains one or several overlapping Tokens along with - * their scores and positions. 
- */ - String highlightTerm(String originalText, TokenGroup tokenGroup); -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (working copy) @@ -1,559 +0,0 @@ -package org.apache.lucene.search.highlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.lucene.analysis.CachingTokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.memory.MemoryIndex; -import org.apache.lucene.search.*; -import org.apache.lucene.search.spans.FieldMaskingSpanQuery; -import org.apache.lucene.search.spans.SpanFirstQuery; -import org.apache.lucene.search.spans.SpanNearQuery; -import org.apache.lucene.search.spans.SpanNotQuery; -import org.apache.lucene.search.spans.SpanOrQuery; -import org.apache.lucene.search.spans.SpanQuery; -import org.apache.lucene.search.spans.SpanTermQuery; -import org.apache.lucene.search.spans.Spans; -import org.apache.lucene.util.StringHelper; - -/** - * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether - * {@link Term}s from the {@link Query} are contained in a supplied {@link TokenStream}. - */ -public class WeightedSpanTermExtractor { - - private String fieldName; - private TokenStream tokenStream; - private Map readers = new HashMap(10); - private String defaultField; - private boolean expandMultiTermQuery; - private boolean cachedTokenStream; - private boolean wrapToCaching = true; - - public WeightedSpanTermExtractor() { - } - - public WeightedSpanTermExtractor(String defaultField) { - if (defaultField != null) { - this.defaultField = StringHelper.intern(defaultField); - } - } - - private void closeReaders() { - Collection readerSet = readers.values(); - - for (final IndexReader reader : readerSet) { - try { - reader.close(); - } catch (IOException e) { - // alert? - } - } - } - - /** - * Fills a Map with <@link WeightedSpanTerm>s using the terms from the supplied Query. 
- * - * @param query - * Query to extract Terms from - * @param terms - * Map to place created WeightedSpanTerms in - * @throws IOException - */ - private void extract(Query query, Map terms) throws IOException { - if (query instanceof BooleanQuery) { - BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses(); - - for (int i = 0; i < queryClauses.length; i++) { - if (!queryClauses[i].isProhibited()) { - extract(queryClauses[i].getQuery(), terms); - } - } - } else if (query instanceof PhraseQuery) { - PhraseQuery phraseQuery = ((PhraseQuery) query); - Term[] phraseQueryTerms = phraseQuery.getTerms(); - SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length]; - for (int i = 0; i < phraseQueryTerms.length; i++) { - clauses[i] = new SpanTermQuery(phraseQueryTerms[i]); - } - int slop = phraseQuery.getSlop(); - int[] positions = phraseQuery.getPositions(); - // add largest position increment to slop - if (positions.length > 0) { - int lastPos = positions[0]; - int largestInc = 0; - int sz = positions.length; - for (int i = 1; i < sz; i++) { - int pos = positions[i]; - int inc = pos - lastPos; - if (inc > largestInc) { - largestInc = inc; - } - lastPos = pos; - } - if(largestInc > 1) { - slop += largestInc; - } - } - - boolean inorder = false; - - if (slop == 0) { - inorder = true; - } - - SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder); - sp.setBoost(query.getBoost()); - extractWeightedSpanTerms(terms, sp); - } else if (query instanceof TermQuery) { - extractWeightedTerms(terms, query); - } else if (query instanceof SpanQuery) { - extractWeightedSpanTerms(terms, (SpanQuery) query); - } else if (query instanceof FilteredQuery) { - extract(((FilteredQuery) query).getQuery(), terms); - } else if (query instanceof DisjunctionMaxQuery) { - for (Iterator iterator = ((DisjunctionMaxQuery) query).iterator(); iterator.hasNext();) { - extract(iterator.next(), terms); - } - } else if (query instanceof MultiTermQuery && expandMultiTermQuery) { - 
MultiTermQuery mtq = ((MultiTermQuery)query); - if(mtq.getRewriteMethod() != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) { - mtq = (MultiTermQuery) mtq.clone(); - mtq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); - query = mtq; - } - if (mtq.getField() != null) { - IndexReader ir = getReaderForField(mtq.getField()); - extract(query.rewrite(ir), terms); - } - // nocommit is this needed anymore? - /* - else { - FakeReader fReader = new FakeReader(); - MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.rewrite(fReader, mtq); - if (fReader.field != null) { - IndexReader ir = getReaderForField(fReader.field); - extract(query.rewrite(ir), terms); - } - } - */ - } else if (query instanceof MultiPhraseQuery) { - final MultiPhraseQuery mpq = (MultiPhraseQuery) query; - final List termArrays = mpq.getTermArrays(); - final int[] positions = mpq.getPositions(); - if (positions.length > 0) { - - int maxPosition = positions[positions.length - 1]; - for (int i = 0; i < positions.length - 1; ++i) { - if (positions[i] > maxPosition) { - maxPosition = positions[i]; - } - } - - final List[] disjunctLists = new List[maxPosition + 1]; - int distinctPositions = 0; - - for (int i = 0; i < termArrays.size(); ++i) { - final Term[] termArray = termArrays.get(i); - List disjuncts = disjunctLists[positions[i]]; - if (disjuncts == null) { - disjuncts = (disjunctLists[positions[i]] = new ArrayList(termArray.length)); - ++distinctPositions; - } - for (int j = 0; j < termArray.length; ++j) { - disjuncts.add(new SpanTermQuery(termArray[j])); - } - } - - int positionGaps = 0; - int position = 0; - final SpanQuery[] clauses = new SpanQuery[distinctPositions]; - for (int i = 0; i < disjunctLists.length; ++i) { - List disjuncts = disjunctLists[i]; - if (disjuncts != null) { - clauses[position++] = new SpanOrQuery(disjuncts - .toArray(new SpanQuery[disjuncts.size()])); - } else { - ++positionGaps; - } - } - - final int slop = mpq.getSlop(); - final boolean inorder = (slop == 0); - - 
SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder); - sp.setBoost(query.getBoost()); - extractWeightedSpanTerms(terms, sp); - } - } - } - - /** - * Fills a Map with <@link WeightedSpanTerm>s using the terms from the supplied SpanQuery. - * - * @param terms - * Map to place created WeightedSpanTerms in - * @param spanQuery - * SpanQuery to extract Terms from - * @throws IOException - */ - private void extractWeightedSpanTerms(Map terms, SpanQuery spanQuery) throws IOException { - Set fieldNames; - - if (fieldName == null) { - fieldNames = new HashSet(); - collectSpanQueryFields(spanQuery, fieldNames); - } else { - fieldNames = new HashSet(1); - fieldNames.add(fieldName); - } - // To support the use of the default field name - if (defaultField != null) { - fieldNames.add(defaultField); - } - - Map queries = new HashMap(); - - Set nonWeightedTerms = new HashSet(); - final boolean mustRewriteQuery = mustRewriteQuery(spanQuery); - if (mustRewriteQuery) { - for (final String field : fieldNames) { - final SpanQuery rewrittenQuery = (SpanQuery) spanQuery.rewrite(getReaderForField(field)); - queries.put(field, rewrittenQuery); - rewrittenQuery.extractTerms(nonWeightedTerms); - } - } else { - spanQuery.extractTerms(nonWeightedTerms); - } - - List spanPositions = new ArrayList(); - - for (final String field : fieldNames) { - - IndexReader reader = getReaderForField(field); - final Spans spans; - if (mustRewriteQuery) { - spans = queries.get(field).getSpans(reader); - } else { - spans = spanQuery.getSpans(reader); - } - - - // collect span positions - while (spans.next()) { - spanPositions.add(new PositionSpan(spans.start(), spans.end() - 1)); - } - - } - - if (spanPositions.size() == 0) { - // no spans found - return; - } - - for (final Term queryTerm : nonWeightedTerms) { - - if (fieldNameComparator(queryTerm.field())) { - WeightedSpanTerm weightedSpanTerm = terms.get(queryTerm.text()); - - if (weightedSpanTerm == null) { - weightedSpanTerm = new 
WeightedSpanTerm(spanQuery.getBoost(), queryTerm.text()); - weightedSpanTerm.addPositionSpans(spanPositions); - weightedSpanTerm.positionSensitive = true; - terms.put(queryTerm.text(), weightedSpanTerm); - } else { - if (spanPositions.size() > 0) { - weightedSpanTerm.addPositionSpans(spanPositions); - } - } - } - } - } - - /** - * Fills a Map with <@link WeightedSpanTerm>s using the terms from the supplied Query. - * - * @param terms - * Map to place created WeightedSpanTerms in - * @param query - * Query to extract Terms from - * @throws IOException - */ - private void extractWeightedTerms(Map terms, Query query) throws IOException { - Set nonWeightedTerms = new HashSet(); - query.extractTerms(nonWeightedTerms); - - for (final Term queryTerm : nonWeightedTerms) { - - if (fieldNameComparator(queryTerm.field())) { - WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.getBoost(), queryTerm.text()); - terms.put(queryTerm.text(), weightedSpanTerm); - } - } - } - - /** - * Necessary to implement matches for queries against defaultField - */ - private boolean fieldNameComparator(String fieldNameToCheck) { - boolean rv = fieldName == null || fieldNameToCheck == fieldName - || fieldNameToCheck == defaultField; - return rv; - } - - private IndexReader getReaderForField(String field) throws IOException { - if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) { - tokenStream = new CachingTokenFilter(tokenStream); - cachedTokenStream = true; - } - IndexReader reader = readers.get(field); - if (reader == null) { - MemoryIndex indexer = new MemoryIndex(); - indexer.addField(field, tokenStream); - tokenStream.reset(); - IndexSearcher searcher = indexer.createSearcher(); - reader = searcher.getIndexReader(); - readers.put(field, reader); - } - - return reader; - } - - /** - * Creates a Map of WeightedSpanTerms from the given Query and TokenStream. - * - *

- * - * @param query - * that caused hit - * @param tokenStream - * of text to be highlighted - * @return Map containing WeightedSpanTerms - * @throws IOException - */ - public Map getWeightedSpanTerms(Query query, TokenStream tokenStream) - throws IOException { - return getWeightedSpanTerms(query, tokenStream, null); - } - - /** - * Creates a Map of WeightedSpanTerms from the given Query and TokenStream. - * - *

- * - * @param query - * that caused hit - * @param tokenStream - * of text to be highlighted - * @param fieldName - * restricts Term's used based on field name - * @return Map containing WeightedSpanTerms - * @throws IOException - */ - public Map getWeightedSpanTerms(Query query, TokenStream tokenStream, - String fieldName) throws IOException { - if (fieldName != null) { - this.fieldName = StringHelper.intern(fieldName); - } else { - this.fieldName = null; - } - - Map terms = new PositionCheckingMap(); - this.tokenStream = tokenStream; - try { - extract(query, terms); - } finally { - closeReaders(); - } - - return terms; - } - - /** - * Creates a Map of WeightedSpanTerms from the given Query and TokenStream. Uses a supplied - * IndexReader to properly weight terms (for gradient highlighting). - * - *

- * - * @param query - * that caused hit - * @param tokenStream - * of text to be highlighted - * @param fieldName - * restricts Term's used based on field name - * @param reader - * to use for scoring - * @return Map of WeightedSpanTerms with quasi tf/idf scores - * @throws IOException - */ - public Map getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream, String fieldName, - IndexReader reader) throws IOException { - if (fieldName != null) { - this.fieldName = StringHelper.intern(fieldName); - } else { - this.fieldName = null; - } - this.tokenStream = tokenStream; - - Map terms = new PositionCheckingMap(); - extract(query, terms); - - int totalNumDocs = reader.numDocs(); - Set weightedTerms = terms.keySet(); - Iterator it = weightedTerms.iterator(); - - try { - while (it.hasNext()) { - WeightedSpanTerm weightedSpanTerm = terms.get(it.next()); - int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term)); - // docFreq counts deletes - if(totalNumDocs < docFreq) { - docFreq = totalNumDocs; - } - // IDF algorithm taken from DefaultSimilarity class - float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0); - weightedSpanTerm.weight *= idf; - } - } finally { - - closeReaders(); - } - - return terms; - } - - private void collectSpanQueryFields(SpanQuery spanQuery, Set fieldNames) { - if (spanQuery instanceof FieldMaskingSpanQuery) { - collectSpanQueryFields(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery(), fieldNames); - } else if (spanQuery instanceof SpanFirstQuery) { - collectSpanQueryFields(((SpanFirstQuery)spanQuery).getMatch(), fieldNames); - } else if (spanQuery instanceof SpanNearQuery) { - for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) { - collectSpanQueryFields(clause, fieldNames); - } - } else if (spanQuery instanceof SpanNotQuery) { - collectSpanQueryFields(((SpanNotQuery)spanQuery).getInclude(), fieldNames); - } else if (spanQuery instanceof SpanOrQuery) { - for (final 
SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) { - collectSpanQueryFields(clause, fieldNames); - } - } else { - fieldNames.add(spanQuery.getField()); - } - } - - private boolean mustRewriteQuery(SpanQuery spanQuery) { - if (!expandMultiTermQuery) { - return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery. - } else if (spanQuery instanceof FieldMaskingSpanQuery) { - return mustRewriteQuery(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery()); - } else if (spanQuery instanceof SpanFirstQuery) { - return mustRewriteQuery(((SpanFirstQuery)spanQuery).getMatch()); - } else if (spanQuery instanceof SpanNearQuery) { - for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) { - if (mustRewriteQuery(clause)) { - return true; - } - } - return false; - } else if (spanQuery instanceof SpanNotQuery) { - SpanNotQuery spanNotQuery = (SpanNotQuery)spanQuery; - return mustRewriteQuery(spanNotQuery.getInclude()) || mustRewriteQuery(spanNotQuery.getExclude()); - } else if (spanQuery instanceof SpanOrQuery) { - for (final SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) { - if (mustRewriteQuery(clause)) { - return true; - } - } - return false; - } else if (spanQuery instanceof SpanTermQuery) { - return false; - } else { - return true; - } - } - - /** - * This class makes sure that if both position sensitive and insensitive - * versions of the same term are added, the position insensitive one wins. 
- */ - static private class PositionCheckingMap extends HashMap { - - @Override - public void putAll(Map m) { - Iterator> it = m.entrySet().iterator(); - while (it.hasNext()) { - Map.Entry entry = it.next(); - this.put(entry.getKey(), entry.getValue()); - } - } - - @Override - public WeightedSpanTerm put(K key, WeightedSpanTerm value) { - WeightedSpanTerm prev = super.put(key, value); - if (prev == null) return prev; - WeightedSpanTerm prevTerm = prev; - WeightedSpanTerm newTerm = value; - if (!prevTerm.positionSensitive) { - newTerm.positionSensitive = false; - } - return prev; - } - - } - - public boolean getExpandMultiTermQuery() { - return expandMultiTermQuery; - } - - public void setExpandMultiTermQuery(boolean expandMultiTermQuery) { - this.expandMultiTermQuery = expandMultiTermQuery; - } - - public boolean isCachedTokenStream() { - return cachedTokenStream; - } - - public TokenStream getTokenStream() { - return tokenStream; - } - - /** - * By default, {@link TokenStream}s that are not of the type - * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to - * ensure an efficient reset - if you are already using a different caching - * {@link TokenStream} impl and you don't want it to be wrapped, set this to - * false. - * - * @param wrap - */ - public void setWrapIfNotCachingTokenFilter(boolean wrap) { - this.wrapToCaching = wrap; - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (working copy) @@ -1,285 +0,0 @@ -/* - * Created on 28-Oct-2004 - */ -package org.apache.lucene.search.highlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.TermFreqVector; -import org.apache.lucene.index.TermPositionVector; -import org.apache.lucene.index.TermVectorOffsetInfo; - -/** - * Hides implementation issues associated with obtaining a TokenStream for use - * with the higlighter - can obtain from TermFreqVectors with offsets and - * (optionally) positions or from Analyzer class reparsing the stored content. - */ -public class TokenSources { - /** - * A convenience method that tries to first get a TermPositionVector for the - * specified docId, then, falls back to using the passed in - * {@link org.apache.lucene.document.Document} to retrieve the TokenStream. - * This is useful when you already have the document, but would prefer to use - * the vector first. 
- * - * @param reader The {@link org.apache.lucene.index.IndexReader} to use to try - * and get the vector from - * @param docId The docId to retrieve. - * @param field The field to retrieve on the document - * @param doc The document to fall back on - * @param analyzer The analyzer to use for creating the TokenStream if the - * vector doesn't exist - * @return The {@link org.apache.lucene.analysis.TokenStream} for the - * {@link org.apache.lucene.document.Fieldable} on the - * {@link org.apache.lucene.document.Document} - * @throws IOException if there was an error loading - */ - public static TokenStream getAnyTokenStream(IndexReader reader, int docId, - String field, Document doc, Analyzer analyzer) throws IOException { - TokenStream ts = null; - - TermFreqVector tfv = reader.getTermFreqVector(docId, field); - if (tfv != null) { - if (tfv instanceof TermPositionVector) { - ts = getTokenStream((TermPositionVector) tfv); - } - } - // No token info stored so fall back to analyzing raw content - if (ts == null) { - ts = getTokenStream(doc, field, analyzer); - } - return ts; - } - - /** - * A convenience method that tries a number of approaches to getting a token - * stream. The cost of finding there are no termVectors in the index is - * minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?) 
- * approach to coding is probably acceptable - * - * @param reader - * @param docId - * @param field - * @param analyzer - * @return null if field not stored correctly - * @throws IOException - */ - public static TokenStream getAnyTokenStream(IndexReader reader, int docId, - String field, Analyzer analyzer) throws IOException { - TokenStream ts = null; - - TermFreqVector tfv = reader.getTermFreqVector(docId, field); - if (tfv != null) { - if (tfv instanceof TermPositionVector) { - ts = getTokenStream((TermPositionVector) tfv); - } - } - // No token info stored so fall back to analyzing raw content - if (ts == null) { - ts = getTokenStream(reader, docId, field, analyzer); - } - return ts; - } - - public static TokenStream getTokenStream(TermPositionVector tpv) { - // assumes the worst and makes no assumptions about token position - // sequences. - return getTokenStream(tpv, false); - } - - /** - * Low level api. Returns a token stream or null if no offset info available - * in index. This can be used to feed the highlighter with a pre-parsed token - * stream - * - * In my tests the speeds to recreate 1000 token streams using this method - * are: - with TermVector offset only data stored - 420 milliseconds - with - * TermVector offset AND position data stored - 271 milliseconds (nb timings - * for TermVector with position data are based on a tokenizer with contiguous - * positions - no overlaps or gaps) The cost of not using TermPositionVector - * to store pre-parsed content and using an analyzer to re-parse the original - * content: - reanalyzing the original content - 980 milliseconds - * - * The re-analyze timings will typically vary depending on - 1) The complexity - * of the analyzer code (timings above were using a - * stemmer/lowercaser/stopword combo) 2) The number of other fields (Lucene - * reads ALL fields off the disk when accessing just one document field - can - * cost dear!) 
3) Use of compression on field storage - could be faster due to - * compression (less disk IO) or slower (more CPU burn) depending on the - * content. - * - * @param tpv - * @param tokenPositionsGuaranteedContiguous true if the token position - * numbers have no overlaps or gaps. If looking to eek out the last - * drops of performance, set to true. If in doubt, set to false. - */ - public static TokenStream getTokenStream(TermPositionVector tpv, - boolean tokenPositionsGuaranteedContiguous) { - if (!tokenPositionsGuaranteedContiguous && tpv.getTermPositions(0) != null) { - return new TokenStreamFromTermPositionVector(tpv); - } - - // an object used to iterate across an array of tokens - final class StoredTokenStream extends TokenStream { - Token tokens[]; - - int currentToken = 0; - - CharTermAttribute termAtt; - - OffsetAttribute offsetAtt; - - StoredTokenStream(Token tokens[]) { - this.tokens = tokens; - termAtt = addAttribute(CharTermAttribute.class); - offsetAtt = addAttribute(OffsetAttribute.class); - } - - @Override - public boolean incrementToken() throws IOException { - if (currentToken >= tokens.length) { - return false; - } - Token token = tokens[currentToken++]; - clearAttributes(); - termAtt.setEmpty().append(token); - offsetAtt.setOffset(token.startOffset(), token.endOffset()); - return true; - } - } - // code to reconstruct the original sequence of Tokens - String[] terms = tpv.getTerms(); - int[] freq = tpv.getTermFrequencies(); - int totalTokens = 0; - - for (int t = 0; t < freq.length; t++) { - totalTokens += freq[t]; - } - Token tokensInOriginalOrder[] = new Token[totalTokens]; - ArrayList unsortedTokens = null; - for (int t = 0; t < freq.length; t++) { - TermVectorOffsetInfo[] offsets = tpv.getOffsets(t); - if (offsets == null) { - throw new IllegalArgumentException("Required TermVector Offset information was not found"); - } - - int[] pos = null; - if (tokenPositionsGuaranteedContiguous) { - // try get the token position info to speed up 
assembly of tokens into - // sorted sequence - pos = tpv.getTermPositions(t); - } - if (pos == null) { - // tokens NOT stored with positions or not guaranteed contiguous - must - // add to list and sort later - if (unsortedTokens == null) { - unsortedTokens = new ArrayList(); - } - for (int tp = 0; tp < offsets.length; tp++) { - Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp] - .getEndOffset()); - unsortedTokens.add(token); - } - } else { - // We have positions stored and a guarantee that the token position - // information is contiguous - - // This may be fast BUT wont work if Tokenizers used which create >1 - // token in same position or - // creates jumps in position numbers - this code would fail under those - // circumstances - - // tokens stored with positions - can use this to index straight into - // sorted array - for (int tp = 0; tp < pos.length; tp++) { - Token token = new Token(terms[t], offsets[tp].getStartOffset(), - offsets[tp].getEndOffset()); - tokensInOriginalOrder[pos[tp]] = token; - } - } - } - // If the field has been stored without position data we must perform a sort - if (unsortedTokens != null) { - tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens - .size()]); - Arrays.sort(tokensInOriginalOrder, new Comparator() { - public int compare(Token t1, Token t2) { - if (t1.startOffset() > t2.endOffset()) - return 1; - if (t1.startOffset() < t2.startOffset()) - return -1; - return 0; - } - }); - } - return new StoredTokenStream(tokensInOriginalOrder); - } - - public static TokenStream getTokenStream(IndexReader reader, int docId, - String field) throws IOException { - TermFreqVector tfv = reader.getTermFreqVector(docId, field); - if (tfv == null) { - throw new IllegalArgumentException(field + " in doc #" + docId - + "does not have any term position data stored"); - } - if (tfv instanceof TermPositionVector) { - TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector( - docId, field); 
- return getTokenStream(tpv); - } - throw new IllegalArgumentException(field + " in doc #" + docId - + "does not have any term position data stored"); - } - - // convenience method - public static TokenStream getTokenStream(IndexReader reader, int docId, - String field, Analyzer analyzer) throws IOException { - Document doc = reader.document(docId); - return getTokenStream(doc, field, analyzer); - } - - public static TokenStream getTokenStream(Document doc, String field, - Analyzer analyzer) { - String contents = doc.get(field); - if (contents == null) { - throw new IllegalArgumentException("Field " + field - + " in document is not stored and cannot be analyzed"); - } - return getTokenStream(field, contents, analyzer); - } - - // convenience method - public static TokenStream getTokenStream(String field, String contents, - Analyzer analyzer) { - return analyzer.tokenStream(field, new StringReader(contents)); - } - -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java (working copy) @@ -1,46 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; - -/** - * Implements the policy for breaking text into multiple fragments for - * consideration by the {@link Highlighter} class. A sophisticated - * implementation may do this on the basis of detecting end of sentences in the - * text. - */ -public interface Fragmenter { - - /** - * Initializes the Fragmenter. You can grab references to the Attributes you are - * interested in from tokenStream and then access the values in {@link #isNewFragment()}. - * - * @param originalText the original source text - * @param tokenStream the {@link TokenStream} to be fragmented - */ - public void start(String originalText, TokenStream tokenStream); - - - /** - * Test to see if this token from the stream should be held in a new - * TextFragment. Every time this is called, the TokenStream - * passed to start(String, TokenStream) will have been incremented. - * - */ - public boolean isNewFragment(); -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java (working copy) @@ -1,31 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Exception thrown if TokenStream Tokens are incompatible with provided text - * - */ -public class InvalidTokenOffsetsException extends Exception -{ - - public InvalidTokenOffsetsException(String message) - { - super(message); - } - -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/GradientFormatter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/GradientFormatter.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/GradientFormatter.java (working copy) @@ -1,227 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Formats text with different color intensity depending on the score of the - * term. - * - */ -public class GradientFormatter implements Formatter -{ - private float maxScore; - - int fgRMin, fgGMin, fgBMin; - - int fgRMax, fgGMax, fgBMax; - - protected boolean highlightForeground; - - int bgRMin, bgGMin, bgBMin; - - int bgRMax, bgGMax, bgBMax; - - protected boolean highlightBackground; - - /** - * Sets the color range for the IDF scores - * - * @param maxScore - * The score (and above) displayed as maxColor (See QueryScorer.getMaxWeight - * which can be used to calibrate scoring scale) - * @param minForegroundColor - * The hex color used for representing IDF scores of zero eg - * #FFFFFF (white) or null if no foreground color required - * @param maxForegroundColor - * The largest hex color used for representing IDF scores eg - * #000000 (black) or null if no foreground color required - * @param minBackgroundColor - * The hex color used for representing IDF scores of zero eg - * #FFFFFF (white) or null if no background color required - * @param maxBackgroundColor - * The largest hex color used for representing IDF scores eg - * #000000 (black) or null if no background color required - */ - public GradientFormatter(float maxScore, String minForegroundColor, - String maxForegroundColor, String minBackgroundColor, - String maxBackgroundColor) - { - highlightForeground = (minForegroundColor != null) - && (maxForegroundColor != null); - if (highlightForeground) - { - if (minForegroundColor.length() != 7) - { - throw new 
IllegalArgumentException( - "minForegroundColor is not 7 bytes long eg a hex " - + "RGB value such as #FFFFFF"); - } - if (maxForegroundColor.length() != 7) - { - throw new IllegalArgumentException( - "minForegroundColor is not 7 bytes long eg a hex " - + "RGB value such as #FFFFFF"); - } - fgRMin = hexToInt(minForegroundColor.substring(1, 3)); - fgGMin = hexToInt(minForegroundColor.substring(3, 5)); - fgBMin = hexToInt(minForegroundColor.substring(5, 7)); - - fgRMax = hexToInt(maxForegroundColor.substring(1, 3)); - fgGMax = hexToInt(maxForegroundColor.substring(3, 5)); - fgBMax = hexToInt(maxForegroundColor.substring(5, 7)); - } - - highlightBackground = (minBackgroundColor != null) - && (maxBackgroundColor != null); - if (highlightBackground) - { - if (minBackgroundColor.length() != 7) - { - throw new IllegalArgumentException( - "minBackgroundColor is not 7 bytes long eg a hex " - + "RGB value such as #FFFFFF"); - } - if (maxBackgroundColor.length() != 7) - { - throw new IllegalArgumentException( - "minBackgroundColor is not 7 bytes long eg a hex " - + "RGB value such as #FFFFFF"); - } - bgRMin = hexToInt(minBackgroundColor.substring(1, 3)); - bgGMin = hexToInt(minBackgroundColor.substring(3, 5)); - bgBMin = hexToInt(minBackgroundColor.substring(5, 7)); - - bgRMax = hexToInt(maxBackgroundColor.substring(1, 3)); - bgGMax = hexToInt(maxBackgroundColor.substring(3, 5)); - bgBMax = hexToInt(maxBackgroundColor.substring(5, 7)); - } - // this.corpusReader = corpusReader; - this.maxScore = maxScore; - // totalNumDocs = corpusReader.numDocs(); - } - - public String highlightTerm(String originalText, TokenGroup tokenGroup) - { - if (tokenGroup.getTotalScore() == 0) - return originalText; - float score = tokenGroup.getTotalScore(); - if (score == 0) - { - return originalText; - } - StringBuilder sb = new StringBuilder(); - sb.append(""); - sb.append(originalText); - sb.append(""); - return sb.toString(); - } - - protected String getForegroundColorString(float score) - { - 
int rVal = getColorVal(fgRMin, fgRMax, score); - int gVal = getColorVal(fgGMin, fgGMax, score); - int bVal = getColorVal(fgBMin, fgBMax, score); - StringBuilder sb = new StringBuilder(); - sb.append("#"); - sb.append(intToHex(rVal)); - sb.append(intToHex(gVal)); - sb.append(intToHex(bVal)); - return sb.toString(); - } - - protected String getBackgroundColorString(float score) - { - int rVal = getColorVal(bgRMin, bgRMax, score); - int gVal = getColorVal(bgGMin, bgGMax, score); - int bVal = getColorVal(bgBMin, bgBMax, score); - StringBuilder sb = new StringBuilder(); - sb.append("#"); - sb.append(intToHex(rVal)); - sb.append(intToHex(gVal)); - sb.append(intToHex(bVal)); - return sb.toString(); - } - - private int getColorVal(int colorMin, int colorMax, float score) - { - if (colorMin == colorMax) - { - return colorMin; - } - float scale = Math.abs(colorMin - colorMax); - float relScorePercent = Math.min(maxScore, score) / maxScore; - float colScore = scale * relScorePercent; - return Math.min(colorMin, colorMax) + (int) colScore; - } - - private static char hexDigits[] = { '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; - - private static String intToHex(int i) - { - return "" + hexDigits[(i & 0xF0) >> 4] + hexDigits[i & 0x0F]; - } - - /** - * Converts a hex string into an int. Integer.parseInt(hex, 16) assumes the - * input is nonnegative unless there is a preceding minus sign. This method - * reads the input as twos complement instead, so if the input is 8 bytes - * long, it will correctly restore a negative int produced by - * Integer.toHexString() but not necessarily one produced by - * Integer.toString(x,16) since that method will produce a string like '-FF' - * for negative integer values. - * - * @param hex - * A string in capital or lower case hex, of no more then 16 - * characters. 
- * @throws NumberFormatException - * if the string is more than 16 characters long, or if any - * character is not in the set [0-9a-fA-f] - */ - public static final int hexToInt(String hex) - { - int len = hex.length(); - if (len > 16) - throw new NumberFormatException(); - - int l = 0; - for (int i = 0; i < len; i++) - { - l <<= 4; - int c = Character.digit(hex.charAt(i), 16); - if (c < 0) - throw new NumberFormatException(); - l |= c; - } - return l; - } - -} - Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java (working copy) @@ -1,116 +0,0 @@ -package org.apache.lucene.search.highlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; - -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.index.TermPositionVector; -import org.apache.lucene.index.TermVectorOffsetInfo; - -public final class TokenStreamFromTermPositionVector extends TokenStream { - - private final List positionedTokens = new ArrayList(); - - private Iterator tokensAtCurrentPosition; - - private CharTermAttribute termAttribute; - - private PositionIncrementAttribute positionIncrementAttribute; - - private OffsetAttribute offsetAttribute; - - /** - * Constructor. - * - * @param termPositionVector TermPositionVector that contains the data for - * creating the TokenStream. Must have positions and offsets. - */ - public TokenStreamFromTermPositionVector( - final TermPositionVector termPositionVector) { - termAttribute = addAttribute(CharTermAttribute.class); - positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class); - offsetAttribute = addAttribute(OffsetAttribute.class); - final String[] terms = termPositionVector.getTerms(); - for (int i = 0; i < terms.length; i++) { - final TermVectorOffsetInfo[] offsets = termPositionVector.getOffsets(i); - final int[] termPositions = termPositionVector.getTermPositions(i); - for (int j = 0; j < termPositions.length; j++) { - Token token; - if (offsets != null) { - token = new Token(terms[i].toCharArray(), 0, terms[i].length(), - offsets[j].getStartOffset(), offsets[j].getEndOffset()); - } else { - token = new Token(); - token.setEmpty().append(terms[i]); - } - // Yes - this is the position, not the increment! 
This is for - // sorting. This value - // will be corrected before use. - token.setPositionIncrement(termPositions[j]); - this.positionedTokens.add(token); - } - } - final Comparator tokenComparator = new Comparator() { - public int compare(final Token o1, final Token o2) { - if (o1.getPositionIncrement() < o2.getPositionIncrement()) { - return -1; - } - if (o1.getPositionIncrement() > o2.getPositionIncrement()) { - return 1; - } - return 0; - } - }; - Collections.sort(this.positionedTokens, tokenComparator); - int lastPosition = -1; - for (final Token token : this.positionedTokens) { - int thisPosition = token.getPositionIncrement(); - token.setPositionIncrement(thisPosition - lastPosition); - lastPosition = thisPosition; - } - this.tokensAtCurrentPosition = this.positionedTokens.iterator(); - } - - @Override - public boolean incrementToken() throws IOException { - if (this.tokensAtCurrentPosition.hasNext()) { - final Token next = this.tokensAtCurrentPosition.next(); - clearAttributes(); - termAttribute.setEmpty().append(next); - positionIncrementAttribute.setPositionIncrement(next - .getPositionIncrement()); - offsetAttribute.setOffset(next.startOffset(), next.endOffset()); - return true; - } - return false; - } - - @Override - public void reset() throws IOException { - this.tokensAtCurrentPosition = this.positionedTokens.iterator(); - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java (working copy) @@ -1,29 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/** - * Encodes original text. The Encoder works with the {@link Formatter} to generate output. - * - */ -public interface Encoder -{ - /** - * @param originalText The section of text being output - */ - String encodeText(String originalText); -} \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java (working copy) @@ -1,104 +0,0 @@ -package org.apache.lucene.search.highlight; - - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - - -/** - * Lightweight class to hold term, weight, and positions used for scoring this - * term. - */ -public class WeightedSpanTerm extends WeightedTerm{ - boolean positionSensitive; - private List positionSpans = new ArrayList(); - - /** - * @param weight - * @param term - */ - public WeightedSpanTerm(float weight, String term) { - super(weight, term); - this.positionSpans = new ArrayList(); - } - - /** - * @param weight - * @param term - * @param positionSensitive - */ - public WeightedSpanTerm(float weight, String term, boolean positionSensitive) { - super(weight, term); - this.positionSensitive = positionSensitive; - } - - /** - * Checks to see if this term is valid at position. - * - * @param position - * to check against valid term positions - * @return true iff this term is a hit at this position - */ - public boolean checkPosition(int position) { - // There would probably be a slight speed improvement if PositionSpans - // where kept in some sort of priority queue - that way this method - // could - // bail early without checking each PositionSpan. 
- Iterator positionSpanIt = positionSpans.iterator(); - - while (positionSpanIt.hasNext()) { - PositionSpan posSpan = positionSpanIt.next(); - - if (((position >= posSpan.start) && (position <= posSpan.end))) { - return true; - } - } - - return false; - } - - public void addPositionSpans(List positionSpans) { - this.positionSpans.addAll(positionSpans); - } - - public boolean isPositionSensitive() { - return positionSensitive; - } - - public void setPositionSensitive(boolean positionSensitive) { - this.positionSensitive = positionSensitive; - } - - public List getPositionSpans() { - return positionSpans; - } -} - - -// Utility class to store a Span -class PositionSpan { - int start; - int end; - - public PositionSpan(int start, int end) { - this.start = start; - this.end = end; - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java (working copy) @@ -1,172 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.HashSet; -import java.util.Iterator; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.FilteredQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.util.StringHelper; - -/** - * Utility class used to extract the terms used in a query, plus any weights. - * This class will not find terms for MultiTermQuery, TermRangeQuery and PrefixQuery classes - * so the caller must pass a rewritten query (see Query.rewrite) to obtain a list of - * expanded terms. - * - */ -public final class QueryTermExtractor -{ - - /** - * Extracts all terms texts of a given Query into an array of WeightedTerms - * - * @param query Query to extract term texts from - * @return an array of the terms used in a query, plus their weights. - */ - public static final WeightedTerm[] getTerms(Query query) - { - return getTerms(query,false); - } - - /** - * Extracts all terms texts of a given Query into an array of WeightedTerms - * - * @param query Query to extract term texts from - * @param reader used to compute IDF which can be used to a) score selected fragments better - * b) use graded highlights eg changing intensity of font color - * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based - * @return an array of the terms used in a query, plus their weights. 
- */ - public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) - { - WeightedTerm[] terms=getTerms(query,false, fieldName); - int totalNumDocs=reader.numDocs(); - for (int i = 0; i < terms.length; i++) - { - try - { - int docFreq=reader.docFreq(new Term(fieldName,terms[i].term)); - // docFreq counts deletes - if(totalNumDocs < docFreq) { - docFreq = totalNumDocs; - } - //IDF algorithm taken from DefaultSimilarity class - float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0); - terms[i].weight*=idf; - } - catch (IOException e) - { - //ignore - } - } - return terms; - } - - /** - * Extracts all terms texts of a given Query into an array of WeightedTerms - * - * @param query Query to extract term texts from - * @param prohibited true to extract "prohibited" terms, too - * @param fieldName The fieldName used to filter query terms - * @return an array of the terms used in a query, plus their weights. - */ - public static final WeightedTerm[] getTerms(Query query, boolean prohibited, String fieldName) - { - HashSet terms=new HashSet(); - if(fieldName!=null) - { - fieldName= StringHelper.intern(fieldName); - } - getTerms(query,terms,prohibited,fieldName); - return terms.toArray(new WeightedTerm[0]); - } - - /** - * Extracts all terms texts of a given Query into an array of WeightedTerms - * - * @param query Query to extract term texts from - * @param prohibited true to extract "prohibited" terms, too - * @return an array of the terms used in a query, plus their weights. 
- */ - public static final WeightedTerm[] getTerms(Query query, boolean prohibited) - { - return getTerms(query,prohibited,null); - } - - //fieldname MUST be interned prior to this call - private static final void getTerms(Query query, HashSet terms,boolean prohibited, String fieldName) - { - try - { - if (query instanceof BooleanQuery) - getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited, fieldName); - else - if(query instanceof FilteredQuery) - getTermsFromFilteredQuery((FilteredQuery)query, terms,prohibited, fieldName); - else - { - HashSet nonWeightedTerms=new HashSet(); - query.extractTerms(nonWeightedTerms); - for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) - { - Term term = iter.next(); - if((fieldName==null)||(term.field()==fieldName)) - { - terms.add(new WeightedTerm(query.getBoost(),term.text())); - } - } - } - } - catch(UnsupportedOperationException ignore) - { - //this is non-fatal for our purposes - } - } - - /** - * extractTerms is currently the only query-independent means of introspecting queries but it only reveals - * a list of terms for that query - not the boosts each individual term in that query may or may not have. - * "Container" queries such as BooleanQuery should be unwrapped to get at the boost info held - * in each child element. - * Some discussion around this topic here: - * http://www.gossamer-threads.com/lists/lucene/java-dev/34208?search_string=introspection;#34208 - * Unfortunately there seemed to be limited interest in requiring all Query objects to implement - * something common which would allow access to child queries so what follows here are query-specific - * implementations for accessing embedded query elements. 
- */ - private static final void getTermsFromBooleanQuery(BooleanQuery query, HashSet terms, boolean prohibited, String fieldName) - { - BooleanClause[] queryClauses = query.getClauses(); - for (int i = 0; i < queryClauses.length; i++) - { - if (prohibited || queryClauses[i].getOccur()!=BooleanClause.Occur.MUST_NOT) - getTerms(queryClauses[i].getQuery(), terms, prohibited, fieldName); - } - } - private static void getTermsFromFilteredQuery(FilteredQuery query, HashSet terms, boolean prohibited, String fieldName) - { - getTerms(query.getQuery(),terms,prohibited,fieldName); - } - -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java (working copy) @@ -1,91 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/** - * Low-level class used to record information about a section of a document - * with a score. 
- * - * - */ -public class TextFragment -{ - CharSequence markedUpText; - int fragNum; - int textStartPos; - int textEndPos; - float score; - - public TextFragment(CharSequence markedUpText,int textStartPos, int fragNum) - { - this.markedUpText=markedUpText; - this.textStartPos = textStartPos; - this.fragNum = fragNum; - } - /** - * @deprecated Use {@link #TextFragment(CharSequence, int, int)} instead. - * This constructor will be removed in Lucene 4.0 - */ - @Deprecated - public TextFragment(StringBuffer markedUpText,int textStartPos, int fragNum) - { - this.markedUpText=markedUpText; - this.textStartPos = textStartPos; - this.fragNum = fragNum; - } - void setScore(float score) - { - this.score=score; - } - public float getScore() - { - return score; - } - /** - * @param frag2 Fragment to be merged into this one - */ - public void merge(TextFragment frag2) - { - textEndPos = frag2.textEndPos; - score=Math.max(score,frag2.score); - } - /** - * @param fragment - * @return true if this fragment follows the one passed - */ - public boolean follows(TextFragment fragment) - { - return textStartPos == fragment.textEndPos; - } - - /** - * @return the fragment sequence number - */ - public int getFragNum() - { - return fragNum; - } - - /* Returns the marked-up text for this text fragment - */ - @Override - public String toString() { - return markedUpText.subSequence(textStartPos, textEndPos).toString(); - } - -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html (working copy) @@ -1,99 +0,0 @@ - - - - - -The highlight package contains classes to provide "keyword in context" features -typically used to highlight search terms in the text of results pages. 
-The Highlighter class is the central component and can be used to extract the -most interesting sections of a piece of text and highlight them, with the help of -Fragmenter, fragment Scorer, and Formatter classes. - -

Example Usage

- -
-  //... Above, create documents with two fields, one with term vectors (tv) and one without (notv)
-  IndexSearcher searcher = new IndexSearcher(directory);
-  QueryParser parser = new QueryParser("notv", analyzer);
-  Query query = parser.parse("million");
-
-  TopDocs hits = searcher.search(query, 10);
-
-  SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
-  Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
-  for (int i = 0; i < 10; i++) {
-    int id = hits.scoreDocs[i].doc;
-    Document doc = searcher.doc(id);
-    String text = doc.get("notv");
-    TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "notv", analyzer);
-    TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "...");
-    for (int j = 0; j < frag.length; j++) {
-      if ((frag[j] != null) && (frag[j].getScore() > 0)) {
-        System.out.println((frag[j].toString()));
-      }
-    }
-    //Term vector
-    text = doc.get("tv");
-    tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), hits.scoreDocs[i].doc, "tv", analyzer);
-    frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
-    for (int j = 0; j < frag.length; j++) {
-      if ((frag[j] != null) && (frag[j].getScore() > 0)) {
-        System.out.println((frag[j].toString()));
-      }
-    }
-    System.out.println("-------------");
-  }
-
- -

New features 06/02/2005

- -This release adds options for encoding (thanks to Nicko Cadell). -An "Encoder" implementation such as the new SimpleHTMLEncoder class can be passed to the highlighter to encode -all those non-xhtml standard characters such as & into legal values. This simple class may not suffice for -some languages - Commons Lang has an implementation that could be used: escapeHtml(String) in -http://svn.apache.org/viewcvs.cgi/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java?rev=137958&view=markup - -

New features 22/12/2004

- -This release adds some new capabilities: -
    -
  1. Faster highlighting using Term vector support
  2. -
  3. New formatting options to use color intensity to show informational value
  4. -
  5. Options for better summarization by using term IDF scores to influence fragment selection
  6. -
- -

-The highlighter takes a TokenStream as input. Until now these streams have typically been produced -using an Analyzer but the new class TokenSources provides helper methods for obtaining TokenStreams from -the new TermVector position support (see latest CVS version).

- -

The new class GradientFormatter can use a scale of colors to highlight terms according to their score. -A subtle use of color can help emphasise the reasons for matching (useful when doing "MoreLikeThis" queries and -you want to see what the basis of the similarities are).

- -

The QueryScorer class has a new constructor which can use an IndexReader to derive the IDF (inverse document frequency) -for each term in order to influence the score. This is useful for helping to extracting the most significant sections -of a document and in supplying scores used by the new GradientFormatter to color significant words more strongly. -The QueryScorer.getMaxWeight method is useful when passed to the GradientFormatter constructor to define the top score -which is associated with the top color.

- - - - - - \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (working copy) @@ -1,268 +0,0 @@ -package org.apache.lucene.search.highlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -import org.apache.lucene.analysis.CachingTokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.memory.MemoryIndex; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.spans.SpanQuery; -import org.apache.lucene.util.StringHelper; - -/** - * {@link Scorer} implementation which scores text fragments by the number of - * unique query terms found. This class converts appropriate {@link Query}s to - * {@link SpanQuery}s and attempts to score only those terms that participated in - * generating the 'hit' on the document. - */ -public class QueryScorer implements Scorer { - private float totalScore; - private Set foundTerms; - private Map fieldWeightedSpanTerms; - private float maxTermWeight; - private int position = -1; - private String defaultField; - private CharTermAttribute termAtt; - private PositionIncrementAttribute posIncAtt; - private boolean expandMultiTermQuery = true; - private Query query; - private String field; - private IndexReader reader; - private boolean skipInitExtractor; - private boolean wrapToCaching = true; - - /** - * @param query Query to use for highlighting - */ - public QueryScorer(Query query) { - init(query, null, null, true); - } - - /** - * @param query Query to use for highlighting - * @param field Field to highlight - pass null to ignore fields - */ - public QueryScorer(Query query, String field) { - init(query, field, null, true); - } - - /** - * @param query Query to use for highlighting - * @param field Field to highlight - pass null to ignore fields - * @param reader {@link IndexReader} to use for quasi tf/idf scoring - */ - public 
QueryScorer(Query query, IndexReader reader, String field) { - init(query, field, reader, true); - } - - - /** - * @param query to use for highlighting - * @param reader {@link IndexReader} to use for quasi tf/idf scoring - * @param field to highlight - pass null to ignore fields - * @param defaultField - */ - public QueryScorer(Query query, IndexReader reader, String field, String defaultField) { - this.defaultField = StringHelper.intern(defaultField); - init(query, field, reader, true); - } - - /** - * @param defaultField - The default field for queries with the field name unspecified - */ - public QueryScorer(Query query, String field, String defaultField) { - this.defaultField = StringHelper.intern(defaultField); - init(query, field, null, true); - } - - /** - * @param weightedTerms an array of pre-created {@link WeightedSpanTerm}s - */ - public QueryScorer(WeightedSpanTerm[] weightedTerms) { - this.fieldWeightedSpanTerms = new HashMap(weightedTerms.length); - - for (int i = 0; i < weightedTerms.length; i++) { - WeightedSpanTerm existingTerm = fieldWeightedSpanTerms.get(weightedTerms[i].term); - - if ((existingTerm == null) || - (existingTerm.weight < weightedTerms[i].weight)) { - // if a term is defined more than once, always use the highest - // scoring weight - fieldWeightedSpanTerms.put(weightedTerms[i].term, weightedTerms[i]); - maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight()); - } - } - skipInitExtractor = true; - } - - /* - * (non-Javadoc) - * - * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore() - */ - public float getFragmentScore() { - return totalScore; - } - - /** - * - * @return The highest weighted term (useful for passing to - * GradientFormatter to set top end of coloring scale). 
- */ - public float getMaxTermWeight() { - return maxTermWeight; - } - - /* - * (non-Javadoc) - * - * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token, - * int) - */ - public float getTokenScore() { - position += posIncAtt.getPositionIncrement(); - String termText = termAtt.toString(); - - WeightedSpanTerm weightedSpanTerm; - - if ((weightedSpanTerm = fieldWeightedSpanTerms.get( - termText)) == null) { - return 0; - } - - if (weightedSpanTerm.positionSensitive && - !weightedSpanTerm.checkPosition(position)) { - return 0; - } - - float score = weightedSpanTerm.getWeight(); - - // found a query term - is it unique in this doc? - if (!foundTerms.contains(termText)) { - totalScore += score; - foundTerms.add(termText); - } - - return score; - } - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream) - */ - public TokenStream init(TokenStream tokenStream) throws IOException { - position = -1; - termAtt = tokenStream.addAttribute(CharTermAttribute.class); - posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class); - if(!skipInitExtractor) { - if(fieldWeightedSpanTerms != null) { - fieldWeightedSpanTerms.clear(); - } - return initExtractor(tokenStream); - } - return null; - } - - /** - * Retrieve the {@link WeightedSpanTerm} for the specified token. Useful for passing - * Span information to a {@link Fragmenter}. 
- * - * @param token to get {@link WeightedSpanTerm} for - * @return WeightedSpanTerm for token - */ - public WeightedSpanTerm getWeightedSpanTerm(String token) { - return fieldWeightedSpanTerms.get(token); - } - - /** - */ - private void init(Query query, String field, IndexReader reader, boolean expandMultiTermQuery) { - this.reader = reader; - this.expandMultiTermQuery = expandMultiTermQuery; - this.query = query; - this.field = field; - } - - private TokenStream initExtractor(TokenStream tokenStream) throws IOException { - WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor() - : new WeightedSpanTermExtractor(defaultField); - - qse.setExpandMultiTermQuery(expandMultiTermQuery); - qse.setWrapIfNotCachingTokenFilter(wrapToCaching); - if (reader == null) { - this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query, - tokenStream, field); - } else { - this.fieldWeightedSpanTerms = qse.getWeightedSpanTermsWithScores(query, - tokenStream, field, reader); - } - if(qse.isCachedTokenStream()) { - return qse.getTokenStream(); - } - - return null; - } - - /* - * (non-Javadoc) - * - * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment) - */ - public void startFragment(TextFragment newFragment) { - foundTerms = new HashSet(); - totalScore = 0; - } - - /** - * @return true if multi-term queries should be expanded - */ - public boolean isExpandMultiTermQuery() { - return expandMultiTermQuery; - } - - /** - * Controls whether or not multi-term queries are expanded - * against a {@link MemoryIndex} {@link IndexReader}. 
- * - * @param expandMultiTermQuery true if multi-term queries should be expanded - */ - public void setExpandMultiTermQuery(boolean expandMultiTermQuery) { - this.expandMultiTermQuery = expandMultiTermQuery; - } - - /** - * By default, {@link TokenStream}s that are not of the type - * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to - * ensure an efficient reset - if you are already using a different caching - * {@link TokenStream} impl and you don't want it to be wrapped, set this to - * false. - * - * @param wrap - */ - public void setWrapIfNotCachingTokenFilter(boolean wrap) { - this.wrapToCaching = wrap; - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java (working copy) @@ -1,81 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * Simple {@link Encoder} implementation to escape text for HTML output - * - */ -public class SimpleHTMLEncoder implements Encoder -{ - public SimpleHTMLEncoder() - { - } - - public String encodeText(String originalText) - { - return htmlEncode(originalText); - } - - /** - * Encode string into HTML - */ - public final static String htmlEncode(String plainText) - { - if (plainText == null || plainText.length() == 0) - { - return ""; - } - - StringBuilder result = new StringBuilder(plainText.length()); - - for (int index=0; index': - result.append(">"); - break; - - default: - if (ch < 128) - { - result.append(ch); - } - else - { - result.append("&#").append((int)ch).append(";"); - } - } - } - - return result.toString(); - } -} \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java (working copy) @@ -1,32 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * Simple {@link Encoder} implementation that does not modify the output - * - */ -public class DefaultEncoder implements Encoder -{ - public DefaultEncoder() - { - } - - public String encodeText(String originalText) - { - return originalText; - } -} \ No newline at end of file Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java (working copy) @@ -1,64 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** Lightweight class to hold term and a weight value used for scoring this term - */ -public class WeightedTerm -{ - float weight; // multiplier - String term; //stemmed form - public WeightedTerm (float weight,String term) - { - this.weight=weight; - this.term=term; - } - - - /** - * @return the term value (stemmed) - */ - public String getTerm() - { - return term; - } - - /** - * @return the weight associated with this term - */ - public float getWeight() - { - return weight; - } - - /** - * @param term the term value (stemmed) - */ - public void setTerm(String term) - { - this.term = term; - } - - /** - * @param weight the weight associated with this term - */ - public void setWeight(float weight) - { - this.weight = weight; - } - -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (working copy) @@ -1,67 +0,0 @@ -package org.apache.lucene.search.highlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; - -import org.apache.lucene.analysis.TokenStream; - -/** - * A Scorer is responsible for scoring a stream of tokens. These token scores - * can then be used to compute {@link TextFragment} scores. - */ -public interface Scorer { - - /** - * Called to init the Scorer with a {@link TokenStream}. You can grab references to - * the attributes you are interested in here and access them from {@link #getTokenScore()}. - * - * @param tokenStream the {@link TokenStream} that will be scored. - * @return either a {@link TokenStream} that the Highlighter should continue using (eg - * if you read the tokenSream in this method) or null to continue - * using the same {@link TokenStream} that was passed in. - * @throws IOException - */ - public TokenStream init(TokenStream tokenStream) throws IOException; - - /** - * Called when a new fragment is started for consideration. - * - * @param newFragment the fragment that will be scored next - */ - public void startFragment(TextFragment newFragment); - - /** - * Called for each token in the current fragment. The {@link Highlighter} will - * increment the {@link TokenStream} passed to init on every call. - * - * @return a score which is passed to the {@link Highlighter} class to influence the - * mark-up of the text (this return value is NOT used to score the - * fragment) - */ - public float getTokenScore(); - - /** - * Called when the {@link Highlighter} has no more tokens for the current fragment - - * the Scorer returns the weighting it has derived for the most recent - * fragment, typically based on the results of {@link #getTokenScore()}. 
- * - */ - public float getFragmentScore(); - -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/BaseFragmentsBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/BaseFragmentsBuilder.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/BaseFragmentsBuilder.java (revision 0) @@ -0,0 +1,140 @@ +package org.apache.lucene.search.highlight.termvector; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.MapFieldSelector; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.highlight.formatting.Formatter; +import org.apache.lucene.search.highlight.formatting.HTMLTagFormatter; +import org.apache.lucene.search.highlight.termvector.FieldFragList.WeightedFragInfo; +import org.apache.lucene.search.highlight.termvector.FieldFragList.WeightedFragInfo.SubInfo; +import org.apache.lucene.search.highlight.termvector.FieldPhraseList.WeightedPhraseInfo.Toffs; + +public abstract class BaseFragmentsBuilder implements FragmentsBuilder { + private final Formatter formatter; + + protected BaseFragmentsBuilder(){ + this(new HTMLTagFormatter("", "")); + } + + protected BaseFragmentsBuilder(Formatter formatter){ + this.formatter = formatter; + } + + static Object checkTagsArgument( Object tags ){ + if( tags instanceof String ) return tags; + else if( tags instanceof String[] ) return tags; + throw new IllegalArgumentException( "type of preTags/postTags must be a String or String[]" ); + } + + public abstract List getWeightedFragInfoList( List src ); + + public String createFragment( IndexReader reader, int docId, + String fieldName, FieldFragList fieldFragList ) throws IOException { + String[] fragments = createFragments( reader, docId, fieldName, fieldFragList, 1 ); + if( fragments == null || fragments.length == 0 ) return null; + return fragments[0]; + } + + public String[] createFragments( IndexReader reader, int docId, + String fieldName, FieldFragList fieldFragList, int maxNumFragments ) + throws IOException { + if( maxNumFragments < 0 ) + throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." 
); + + List fragInfos = getWeightedFragInfoList( fieldFragList.fragInfos ); + + List fragments = new ArrayList( maxNumFragments ); + Field[] values = getFields( reader, docId, fieldName ); + if( values.length == 0 ) return null; + StringBuilder buffer = new StringBuilder(); + int[] nextValueIndex = { 0 }; + for( int n = 0; n < maxNumFragments && n < fragInfos.size(); n++ ){ + WeightedFragInfo fragInfo = fragInfos.get( n ); + fragments.add( makeFragment( buffer, nextValueIndex, values, fragInfo ) ); + } + return fragments.toArray( new String[fragments.size()] ); + } + + @Deprecated + protected String[] getFieldValues( IndexReader reader, int docId, String fieldName) throws IOException { + Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) ); + return doc.getValues( fieldName ); // according to Document class javadoc, this never returns null + } + + protected Field[] getFields( IndexReader reader, int docId, String fieldName) throws IOException { + // according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field??? 
+ Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) ); + return doc.getFields( fieldName ); // according to Document class javadoc, this never returns null + } + + @Deprecated + protected String makeFragment( StringBuilder buffer, int[] index, String[] values, WeightedFragInfo fragInfo ){ + final int s = fragInfo.startOffset; + return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s ); + } + + protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo ){ + final int s = fragInfo.startOffset; + return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s ); + } + + private String makeFragment( WeightedFragInfo fragInfo, String src, int s ){ + StringBuilder fragment = new StringBuilder(); + int srcIndex = 0; + for( SubInfo subInfo : fragInfo.subInfos ){ + for( Toffs to : subInfo.termsOffsets ){ + fragment.append( src.substring( srcIndex, to.startOffset - s ) ).append( formatter.getPreTag() ) + .append( src.substring( to.startOffset - s, to.endOffset - s ) ).append( formatter.getPostTag() ); + srcIndex = to.endOffset - s; + } + } + fragment.append( src.substring( srcIndex ) ); + return fragment.toString(); + } + + @Deprecated + protected String getFragmentSource( StringBuilder buffer, int[] index, String[] values, + int startOffset, int endOffset ){ + while( buffer.length() < endOffset && index[0] < values.length ){ + if( index[0] > 0 && values[index[0]].length() > 0 ) + buffer.append( ' ' ); + buffer.append( values[index[0]++] ); + } + int eo = buffer.length() < endOffset ? 
buffer.length() : endOffset; + return buffer.substring( startOffset, eo ); + } + + protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values, + int startOffset, int endOffset ){ + while( buffer.length() < endOffset && index[0] < values.length ){ + if( index[0] > 0 && values[index[0]].isTokenized() && values[index[0]].stringValue().length() > 0 ) + buffer.append( ' ' ); + buffer.append( values[index[0]++].stringValue() ); + } + int eo = buffer.length() < endOffset ? buffer.length() : endOffset; + return buffer.substring( startOffset, eo ); + } + +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FastVectorHighlighter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FastVectorHighlighter.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FastVectorHighlighter.java (revision 0) @@ -0,0 +1,137 @@ +package org.apache.lucene.search.highlight.termvector; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Query; + +/** + * Another highlighter implementation. + * + */ +public class FastVectorHighlighter { + + public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true; + public static final boolean DEFAULT_FIELD_MATCH = true; + private final boolean phraseHighlight; + private final boolean fieldMatch; + private final FragListBuilder fragListBuilder; + private final FragmentsBuilder fragmentsBuilder; + + /** + * the default constructor. + */ + public FastVectorHighlighter(){ + this( DEFAULT_PHRASE_HIGHLIGHT, DEFAULT_FIELD_MATCH ); + } + + /** + * a constructor. Using SimpleFragListBuilder and ScoreOrderFragmentsBuilder. + * + * @param phraseHighlight true or false for phrase highlighting + * @param fieldMatch true or false for field matching + */ + public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch ){ + this( phraseHighlight, fieldMatch, new SimpleFragListBuilder(), new ScoreOrderFragmentsBuilder() ); + } + + /** + * a constructor. A FragListBuilder and a FragmentsBuilder can be specified (plugins). + * + * @param phraseHighlight true or false for phrase highlighting + * @param fieldMatch true or false for field matching + * @param fragListBuilder an instance of FragListBuilder + * @param fragmentsBuilder an instance of FragmentsBuilder + */ + public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch, + FragListBuilder fragListBuilder, FragmentsBuilder fragmentsBuilder ){ + this.phraseHighlight = phraseHighlight; + this.fieldMatch = fieldMatch; + this.fragListBuilder = fragListBuilder; + this.fragmentsBuilder = fragmentsBuilder; + } + + /** + * create a FieldQuery object. + * + * @param query a query + * @return the created FieldQuery object + */ + public FieldQuery getFieldQuery( Query query ){ + return new FieldQuery( query, phraseHighlight, fieldMatch ); + } + + /** + * return the best fragment. 
+ * + * @param fieldQuery FieldQuery object + * @param reader IndexReader of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fragCharSize the length (number of chars) of a fragment + * @return the best fragment (snippet) string + * @throws IOException + */ + public final String getBestFragment( final FieldQuery fieldQuery, IndexReader reader, int docId, + String fieldName, int fragCharSize ) throws IOException { + FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize ); + return fragmentsBuilder.createFragment( reader, docId, fieldName, fieldFragList ); + } + + /** + * return the best fragments. + * + * @param fieldQuery FieldQuery object + * @param reader IndexReader of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fragCharSize the length (number of chars) of a fragment + * @param maxNumFragments maximum number of fragments + * @return created fragments or null when no fragments created. 
+ * size of the array can be less than maxNumFragments + * @throws IOException + */ + public final String[] getBestFragments( final FieldQuery fieldQuery, IndexReader reader, int docId, + String fieldName, int fragCharSize, int maxNumFragments ) throws IOException { + FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize ); + return fragmentsBuilder.createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments ); + } + + private FieldFragList getFieldFragList( final FieldQuery fieldQuery, IndexReader reader, int docId, + String fieldName, int fragCharSize ) throws IOException { + FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, fieldName, fieldQuery ); + FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery ); + return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize ); + } + + /** + * return whether phraseHighlight or not. + * + * @return whether phraseHighlight or not + */ + public boolean isPhraseHighlight(){ return phraseHighlight; } + + /** + * return whether fieldMatch or not. + * + * @return whether fieldMatch or not + */ + public boolean isFieldMatch(){ return fieldMatch; } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldFragList.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldFragList.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldFragList.java (revision 0) @@ -0,0 +1,128 @@ +package org.apache.lucene.search.highlight.termvector; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.search.highlight.termvector.FieldPhraseList.WeightedPhraseInfo; +import org.apache.lucene.search.highlight.termvector.FieldPhraseList.WeightedPhraseInfo.Toffs; + +/** + * FieldFragList has a list of "frag info" that is used by FragmentsBuilder class + * to create fragments (snippets). + */ +public class FieldFragList { + + List fragInfos = new ArrayList(); + + /** + * a constructor. 
+ * + * @param fragCharSize the length (number of chars) of a fragment + */ + public FieldFragList( int fragCharSize ){ + } + + /** + * convert the list of WeightedPhraseInfo to WeightedFragInfo, then add it to the fragInfos + * + * @param startOffset start offset of the fragment + * @param endOffset end offset of the fragment + * @param phraseInfoList list of WeightedPhraseInfo objects + */ + public void add( int startOffset, int endOffset, List phraseInfoList ){ + fragInfos.add( new WeightedFragInfo( startOffset, endOffset, phraseInfoList ) ); + } + + public static class WeightedFragInfo { + + List subInfos; + float totalBoost; + int startOffset; + int endOffset; + + public WeightedFragInfo( int startOffset, int endOffset, List phraseInfoList ){ + this.startOffset = startOffset; + this.endOffset = endOffset; + subInfos = new ArrayList(); + for( WeightedPhraseInfo phraseInfo : phraseInfoList ){ + SubInfo subInfo = new SubInfo( phraseInfo.text, phraseInfo.termsOffsets, phraseInfo.seqnum ); + subInfos.add( subInfo ); + totalBoost += phraseInfo.boost; + } + } + + public List getSubInfos(){ + return subInfos; + } + + public float getTotalBoost(){ + return totalBoost; + } + + public int getStartOffset(){ + return startOffset; + } + + public int getEndOffset(){ + return endOffset; + } + + @Override + public String toString(){ + StringBuilder sb = new StringBuilder(); + sb.append( "subInfos=(" ); + for( SubInfo si : subInfos ) + sb.append( si.toString() ); + sb.append( ")/" ).append( totalBoost ).append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); + return sb.toString(); + } + + public static class SubInfo { + final String text; // unnecessary member, just exists for debugging purpose + final List termsOffsets; // usually termsOffsets.size() == 1, + // but if position-gap > 1 and slop > 0 then size() could be greater than 1 + int seqnum; + + SubInfo( String text, List termsOffsets, int seqnum ){ + this.text = text; + this.termsOffsets = 
termsOffsets; + this.seqnum = seqnum; + } + + public List getTermsOffsets(){ + return termsOffsets; + } + + public int getSeqnum(){ + return seqnum; + } + + @Override + public String toString(){ + StringBuilder sb = new StringBuilder(); + sb.append( text ).append( '(' ); + for( Toffs to : termsOffsets ) + sb.append( to.toString() ); + sb.append( ')' ); + return sb.toString(); + } + } + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldPhraseList.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldPhraseList.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldPhraseList.java (revision 0) @@ -0,0 +1,191 @@ +package org.apache.lucene.search.highlight.termvector; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +import org.apache.lucene.search.highlight.termvector.FieldQuery.QueryPhraseMap; +import org.apache.lucene.search.highlight.termvector.FieldTermStack.TermInfo; + +/** + * FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder + * to create a FieldFragList object. + */ +public class FieldPhraseList { + + LinkedList phraseList = new LinkedList(); + + /** + * a constructor. + * + * @param fieldTermStack FieldTermStack object + * @param fieldQuery FieldQuery object + */ + public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery ){ + final String field = fieldTermStack.getFieldName(); + + LinkedList phraseCandidate = new LinkedList(); + QueryPhraseMap currMap = null; + QueryPhraseMap nextMap = null; + while( !fieldTermStack.isEmpty() ){ + + phraseCandidate.clear(); + + TermInfo ti = fieldTermStack.pop(); + currMap = fieldQuery.getFieldTermMap( field, ti.getText() ); + + // if not found, discard top TermInfo from stack, then try next element + if( currMap == null ) continue; + + // if found, search the longest phrase + phraseCandidate.add( ti ); + while( true ){ + ti = fieldTermStack.pop(); + nextMap = null; + if( ti != null ) + nextMap = currMap.getTermMap( ti.getText() ); + if( ti == null || nextMap == null ){ + if( ti != null ) + fieldTermStack.push( ti ); + if( currMap.isValidTermOrPhrase( phraseCandidate ) ){ + addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); + } + else{ + while( phraseCandidate.size() > 1 ){ + fieldTermStack.push( phraseCandidate.removeLast() ); + currMap = fieldQuery.searchPhrase( field, phraseCandidate ); + if( currMap != null ){ + addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); + break; + } + } + } + break; + } + else{ + phraseCandidate.add( ti ); + currMap = nextMap; + } + } + } + 
} + + void addIfNoOverlap( WeightedPhraseInfo wpi ){ + for( WeightedPhraseInfo existWpi : phraseList ){ + if( existWpi.isOffsetOverlap( wpi ) ) return; + } + phraseList.add( wpi ); + } + + public static class WeightedPhraseInfo { + + String text; // unnecessary member, just exists for debugging purpose + List termsOffsets; // usually termsOffsets.size() == 1, + // but if position-gap > 1 and slop > 0 then size() could be greater than 1 + float boost; // query boost + int seqnum; + + public WeightedPhraseInfo( LinkedList terms, float boost ){ + this( terms, boost, 0 ); + } + + public WeightedPhraseInfo( LinkedList terms, float boost, int number ){ + this.boost = boost; + this.seqnum = number; + termsOffsets = new ArrayList( terms.size() ); + TermInfo ti = terms.get( 0 ); + termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) ); + if( terms.size() == 1 ){ + text = ti.getText(); + return; + } + StringBuilder sb = new StringBuilder(); + sb.append( ti.getText() ); + int pos = ti.getPosition(); + for( int i = 1; i < terms.size(); i++ ){ + ti = terms.get( i ); + sb.append( ti.getText() ); + if( ti.getPosition() - pos == 1 ){ + Toffs to = termsOffsets.get( termsOffsets.size() - 1 ); + to.setEndOffset( ti.getEndOffset() ); + } + else{ + termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) ); + } + pos = ti.getPosition(); + } + text = sb.toString(); + } + + public int getStartOffset(){ + return termsOffsets.get( 0 ).startOffset; + } + + public int getEndOffset(){ + return termsOffsets.get( termsOffsets.size() - 1 ).endOffset; + } + + public boolean isOffsetOverlap( WeightedPhraseInfo other ){ + int so = getStartOffset(); + int eo = getEndOffset(); + int oso = other.getStartOffset(); + int oeo = other.getEndOffset(); + if( so <= oso && oso < eo ) return true; + if( so < oeo && oeo <= eo ) return true; + if( oso <= so && so < oeo ) return true; + if( oso < eo && eo <= oeo ) return true; + return false; + } + + @Override + public String 
toString(){ + StringBuilder sb = new StringBuilder(); + sb.append( text ).append( '(' ).append( boost ).append( ")(" ); + for( Toffs to : termsOffsets ){ + sb.append( to ); + } + sb.append( ')' ); + return sb.toString(); + } + + public static class Toffs { + int startOffset; + int endOffset; + public Toffs( int startOffset, int endOffset ){ + this.startOffset = startOffset; + this.endOffset = endOffset; + } + public void setEndOffset( int endOffset ){ + this.endOffset = endOffset; + } + public int getStartOffset(){ + return startOffset; + } + public int getEndOffset(){ + return endOffset; + } + @Override + public String toString(){ + StringBuilder sb = new StringBuilder(); + sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); + return sb.toString(); + } + } + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldQuery.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldQuery.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldQuery.java (revision 0) @@ -0,0 +1,399 @@ +package org.apache.lucene.search.highlight.termvector; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.highlight.termvector.FieldTermStack.TermInfo; + +/** + * FieldQuery breaks down query object into terms/phrases and keep + * them in QueryPhraseMap structure. + */ +public class FieldQuery { + + final boolean fieldMatch; + + // fieldMatch==true, Map + // fieldMatch==false, Map + Map rootMaps = new HashMap(); + + // fieldMatch==true, Map + // fieldMatch==false, Map + Map> termSetMap = new HashMap>(); + + int termOrPhraseNumber; // used for colored tag support + + FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ){ + this.fieldMatch = fieldMatch; + Set flatQueries = new HashSet(); + flatten( query, flatQueries ); + saveTerms( flatQueries ); + Collection expandQueries = expand( flatQueries ); + + for( Query flatQuery : expandQueries ){ + QueryPhraseMap rootMap = getRootMap( flatQuery ); + rootMap.add( flatQuery ); + if( !phraseHighlight && flatQuery instanceof PhraseQuery ){ + PhraseQuery pq = (PhraseQuery)flatQuery; + if( pq.getTerms().length > 1 ){ + for( Term term : pq.getTerms() ) + rootMap.addTerm( term, flatQuery.getBoost() ); + } + } + } + } + + void flatten( Query sourceQuery, Collection flatQueries ){ + if( sourceQuery instanceof BooleanQuery ){ + BooleanQuery bq = (BooleanQuery)sourceQuery; + for( BooleanClause clause : bq.getClauses() ){ + if( !clause.isProhibited() ) + flatten( clause.getQuery(), 
flatQueries ); + } + } + else if( sourceQuery instanceof DisjunctionMaxQuery ){ + DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery; + for( Query query : dmq ){ + flatten( query, flatQueries ); + } + } + else if( sourceQuery instanceof TermQuery ){ + if( !flatQueries.contains( sourceQuery ) ) + flatQueries.add( sourceQuery ); + } + else if( sourceQuery instanceof PhraseQuery ){ + if( !flatQueries.contains( sourceQuery ) ){ + PhraseQuery pq = (PhraseQuery)sourceQuery; + if( pq.getTerms().length > 1 ) + flatQueries.add( pq ); + else if( pq.getTerms().length == 1 ){ + flatQueries.add( new TermQuery( pq.getTerms()[0] ) ); + } + } + } + // else discard queries + } + + /* + * Create expandQueries from flatQueries. + * + * expandQueries := flatQueries + overlapped phrase queries + * + * ex1) flatQueries={a,b,c} + * => expandQueries={a,b,c} + * ex2) flatQueries={a,"b c","c d"} + * => expandQueries={a,"b c","c d","b c d"} + */ + Collection expand( Collection flatQueries ){ + Set expandQueries = new HashSet(); + for( Iterator i = flatQueries.iterator(); i.hasNext(); ){ + Query query = i.next(); + i.remove(); + expandQueries.add( query ); + if( !( query instanceof PhraseQuery ) ) continue; + for( Iterator j = flatQueries.iterator(); j.hasNext(); ){ + Query qj = j.next(); + if( !( qj instanceof PhraseQuery ) ) continue; + checkOverlap( expandQueries, (PhraseQuery)query, (PhraseQuery)qj ); + } + } + return expandQueries; + } + + /* + * Check if PhraseQuery A and B have overlapped part. 
+ * + * ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"} + * ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"} + * ex3) A="a b", B="c d" => no overlap; expandQueries={} + */ + private void checkOverlap( Collection expandQueries, PhraseQuery a, PhraseQuery b ){ + if( a.getSlop() != b.getSlop() ) return; + Term[] ats = a.getTerms(); + Term[] bts = b.getTerms(); + if( fieldMatch && !ats[0].field().equals( bts[0].field() ) ) return; + checkOverlap( expandQueries, ats, bts, a.getSlop(), a.getBoost() ); + checkOverlap( expandQueries, bts, ats, b.getSlop(), b.getBoost() ); + } + + /* + * Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries. + * + * ex1) src="a b", dest="c d" => no overlap + * ex2) src="a b", dest="a b c" => no overlap + * ex3) src="a b", dest="b c" => overlap; expandQueries={"a b c"} + * ex4) src="a b c", dest="b c d" => overlap; expandQueries={"a b c d"} + * ex5) src="a b c", dest="b c" => no overlap + * ex6) src="a b c", dest="b" => no overlap + * ex7) src="a a a a", dest="a a a" => overlap; + * expandQueries={"a a a a a","a a a a a a"} + * ex8) src="a b c d", dest="b c" => no overlap + */ + private void checkOverlap( Collection expandQueries, Term[] src, Term[] dest, int slop, float boost ){ + // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms + // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten() + // converts PhraseQuery to TermQuery) + for( int i = 1; i < src.length; i++ ){ + boolean overlap = true; + for( int j = i; j < src.length; j++ ){ + if( ( j - i ) < dest.length && !src[j].text().equals( dest[j-i].text() ) ){ + overlap = false; + break; + } + } + if( overlap && src.length - i < dest.length ){ + PhraseQuery pq = new PhraseQuery(); + for( Term srcTerm : src ) + pq.add( srcTerm ); + for( int k = src.length - i; k < dest.length; k++ ){ + pq.add( new Term( src[0].field(), dest[k].text() ) ); + } + pq.setSlop( slop ); + 
pq.setBoost( boost ); + if(!expandQueries.contains( pq ) ) + expandQueries.add( pq ); + } + } + } + + QueryPhraseMap getRootMap( Query query ){ + String key = getKey( query ); + QueryPhraseMap map = rootMaps.get( key ); + if( map == null ){ + map = new QueryPhraseMap( this ); + rootMaps.put( key, map ); + } + return map; + } + + /* + * Return 'key' string. 'key' is the field name of the Query. + * If not fieldMatch, 'key' will be null. + */ + private String getKey( Query query ){ + if( !fieldMatch ) return null; + if( query instanceof TermQuery ) + return ((TermQuery)query).getTerm().field(); + else if ( query instanceof PhraseQuery ){ + PhraseQuery pq = (PhraseQuery)query; + Term[] terms = pq.getTerms(); + return terms[0].field(); + } + else + throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); + } + + /* + * Save the set of terms in the queries to termSetMap. + * + * ex1) q=name:john + * - fieldMatch==true + * termSetMap=Map<"name",Set<"john">> + * - fieldMatch==false + * termSetMap=Map> + * + * ex2) q=name:john title:manager + * - fieldMatch==true + * termSetMap=Map<"name",Set<"john">, + * "title",Set<"manager">> + * - fieldMatch==false + * termSetMap=Map> + * + * ex3) q=name:"john lennon" + * - fieldMatch==true + * termSetMap=Map<"name",Set<"john","lennon">> + * - fieldMatch==false + * termSetMap=Map> + */ + void saveTerms( Collection flatQueries ){ + for( Query query : flatQueries ){ + Set termSet = getTermSet( query ); + if( query instanceof TermQuery ) + termSet.add( ((TermQuery)query).getTerm().text() ); + else if( query instanceof PhraseQuery ){ + for( Term term : ((PhraseQuery)query).getTerms() ) + termSet.add( term.text() ); + } + else + throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." 
); + } + } + + private Set getTermSet( Query query ){ + String key = getKey( query ); + Set set = termSetMap.get( key ); + if( set == null ){ + set = new HashSet(); + termSetMap.put( key, set ); + } + return set; + } + + Set getTermSet( String field ){ + return termSetMap.get( fieldMatch ? field : null ); + } + + /** + * + * @param fieldName + * @param term + * @return QueryPhraseMap + */ + public QueryPhraseMap getFieldTermMap( String fieldName, String term ){ + QueryPhraseMap rootMap = getRootMap( fieldName ); + return rootMap == null ? null : rootMap.subMap.get( term ); + } + + /** + * + * @param fieldName + * @param phraseCandidate + * @return QueryPhraseMap + */ + public QueryPhraseMap searchPhrase( String fieldName, final List phraseCandidate ){ + QueryPhraseMap root = getRootMap( fieldName ); + if( root == null ) return null; + return root.searchPhrase( phraseCandidate ); + } + + private QueryPhraseMap getRootMap( String fieldName ){ + return rootMaps.get( fieldMatch ? fieldName : null ); + } + + int nextTermOrPhraseNumber(){ + return termOrPhraseNumber++; + } + + public static class QueryPhraseMap { + + boolean terminal; + int slop; // valid if terminal == true and phraseHighlight == true + float boost; // valid if terminal == true + int termOrPhraseNumber; // valid if terminal == true + FieldQuery fieldQuery; + Map subMap = new HashMap(); + + public QueryPhraseMap( FieldQuery fieldQuery ){ + this.fieldQuery = fieldQuery; + } + + void addTerm( Term term, float boost ){ + QueryPhraseMap map = getOrNewMap( subMap, term.text() ); + map.markTerminal( boost ); + } + + private QueryPhraseMap getOrNewMap( Map subMap, String term ){ + QueryPhraseMap map = subMap.get( term ); + if( map == null ){ + map = new QueryPhraseMap( fieldQuery ); + subMap.put( term, map ); + } + return map; + } + + void add( Query query ){ + if( query instanceof TermQuery ){ + addTerm( ((TermQuery)query).getTerm(), query.getBoost() ); + } + else if( query instanceof PhraseQuery ){ + 
PhraseQuery pq = (PhraseQuery)query; + Term[] terms = pq.getTerms(); + Map map = subMap; + QueryPhraseMap qpm = null; + for( Term term : terms ){ + qpm = getOrNewMap( map, term.text() ); + map = qpm.subMap; + } + qpm.markTerminal( pq.getSlop(), pq.getBoost() ); + } + else + throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); + } + + public QueryPhraseMap getTermMap( String term ){ + return subMap.get( term ); + } + + private void markTerminal( float boost ){ + markTerminal( 0, boost ); + } + + private void markTerminal( int slop, float boost ){ + this.terminal = true; + this.slop = slop; + this.boost = boost; + this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber(); + } + + public boolean isTerminal(){ + return terminal; + } + + public int getSlop(){ + return slop; + } + + public float getBoost(){ + return boost; + } + + public int getTermOrPhraseNumber(){ + return termOrPhraseNumber; + } + + public QueryPhraseMap searchPhrase( final List phraseCandidate ){ + QueryPhraseMap currMap = this; + for( TermInfo ti : phraseCandidate ){ + currMap = currMap.subMap.get( ti.getText() ); + if( currMap == null ) return null; + } + return currMap.isValidTermOrPhrase( phraseCandidate ) ? 
currMap : null; + } + + public boolean isValidTermOrPhrase( final List phraseCandidate ){ + // check terminal + if( !terminal ) return false; + + // if the candidate is a term, it is valid + if( phraseCandidate.size() == 1 ) return true; + + // else check whether the candidate is valid phrase + // compare position-gaps between terms to slop + int pos = phraseCandidate.get( 0 ).getPosition(); + for( int i = 1; i < phraseCandidate.size(); i++ ){ + int nextPos = phraseCandidate.get( i ).getPosition(); + if( Math.abs( nextPos - pos - 1 ) > slop ) return false; + pos = nextPos; + } + return true; + } + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldTermStack.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldTermStack.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldTermStack.java (revision 0) @@ -0,0 +1,159 @@ +package org.apache.lucene.search.highlight.termvector; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Collections; +import java.util.LinkedList; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositionVector; +import org.apache.lucene.index.TermVectorOffsetInfo; + +/** + * FieldTermStack is a stack that keeps query terms in the specified field + * of the document to be highlighted. + */ +public class FieldTermStack { + + private final String fieldName; + LinkedList termList = new LinkedList(); + + //public static void main( String[] args ) throws Exception { + // Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT); + // QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "f", analyzer ); + // Query query = parser.parse( "a x:b" ); + // FieldQuery fieldQuery = new FieldQuery( query, true, false ); + + // Directory dir = new RAMDirectory(); + // IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)); + // Document doc = new Document(); + // doc.add( new Field( "f", "a a a b b c a b b c d e f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) ); + // doc.add( new Field( "f", "b a b a f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) ); + // writer.addDocument( doc ); + // writer.close(); + + // IndexReader reader = IndexReader.open( dir, true ); + // new FieldTermStack( reader, 0, "f", fieldQuery ); + // reader.close(); + //} + + /** + * a constructor. 
+ * + * @param reader IndexReader of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fieldQuery FieldQuery object + * @throws IOException + */ + public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException { + this.fieldName = fieldName; + + TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName ); + if( tfv == null ) return; // just return to make null snippets + TermPositionVector tpv = null; + try{ + tpv = (TermPositionVector)tfv; + } + catch( ClassCastException e ){ + return; // just return to make null snippets + } + + Set termSet = fieldQuery.getTermSet( fieldName ); + // just return to make null snippet if un-matched fieldName specified when fieldMatch == true + if( termSet == null ) return; + + for( String term : tpv.getTerms() ){ + if( !termSet.contains( term ) ) continue; + int index = tpv.indexOf( term ); + TermVectorOffsetInfo[] tvois = tpv.getOffsets( index ); + if( tvois == null ) return; // just return to make null snippets + int[] poss = tpv.getTermPositions( index ); + if( poss == null ) return; // just return to make null snippets + for( int i = 0; i < tvois.length; i++ ) + termList.add( new TermInfo( term, tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); + } + + // sort by position + Collections.sort( termList ); + } + + /** + * @return field name + */ + public String getFieldName(){ + return fieldName; + } + + /** + * @return the top TermInfo object of the stack + */ + public TermInfo pop(){ + return termList.poll(); + } + + /** + * @param termInfo the TermInfo object to be put on the top of the stack + */ + public void push( TermInfo termInfo ){ + // termList.push( termInfo ); // avoid Java 1.6 feature + termList.addFirst( termInfo ); + } + + /** + * to know whether the stack is empty + * + * @return true if the stack is empty, false if not + */ + public boolean isEmpty(){ + return 
termList == null || termList.size() == 0; + } + + public static class TermInfo implements Comparable{ + + final String text; + final int startOffset; + final int endOffset; + final int position; + + TermInfo( String text, int startOffset, int endOffset, int position ){ + this.text = text; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.position = position; + } + + public String getText(){ return text; } + public int getStartOffset(){ return startOffset; } + public int getEndOffset(){ return endOffset; } + public int getPosition(){ return position; } + + @Override + public String toString(){ + StringBuilder sb = new StringBuilder(); + sb.append( text ).append( '(' ).append(startOffset).append( ',' ).append( endOffset ).append( ',' ).append( position ).append( ')' ); + return sb.toString(); + } + + public int compareTo( TermInfo o ) { + return ( this.position - o.position ); + } + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragListBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragListBuilder.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragListBuilder.java (revision 0) @@ -0,0 +1,34 @@ +package org.apache.lucene.search.highlight.termvector; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * FragListBuilder is an interface for FieldFragList builder classes. + * A FragListBuilder class can be plugged in to Highlighter. + */ +public interface FragListBuilder { + + /** + * create a FieldFragList. + * + * @param fieldPhraseList FieldPhraseList object + * @param fragCharSize the length (number of chars) of a fragment + * @return the created FieldFragList object + */ + public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ); +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragmentsBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragmentsBuilder.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragmentsBuilder.java (revision 0) @@ -0,0 +1,57 @@ +package org.apache.lucene.search.highlight.termvector; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; + +/** + * FragmentsBuilder is an interface for fragments (snippets) builder classes. + * A FragmentsBuilder class can be plugged in to Highlighter. + */ +public interface FragmentsBuilder { + + /** + * create a fragment. + * + * @param reader IndexReader of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fieldFragList FieldFragList object + * @return a created fragment or null when no fragment created + * @throws IOException + */ + public String createFragment( IndexReader reader, int docId, String fieldName, + FieldFragList fieldFragList ) throws IOException; + + /** + * create multiple fragments. + * + * @param reader IndexReader of the index + * @param docId document id to be highlighted + * @param fieldName field of the document to be highlighted + * @param fieldFragList FieldFragList object + * @param maxNumFragments maximum number of fragments + * @return created fragments or null when no fragments created. 
+ * size of the array can be less than maxNumFragments + * @throws IOException + */ + public String[] createFragments( IndexReader reader, int docId, String fieldName, + FieldFragList fieldFragList, int maxNumFragments ) throws IOException; +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/ScoreOrderFragmentsBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/ScoreOrderFragmentsBuilder.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/ScoreOrderFragmentsBuilder.java (revision 0) @@ -0,0 +1,68 @@ +package org.apache.lucene.search.highlight.termvector; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +import org.apache.lucene.search.highlight.formatting.Formatter; +import org.apache.lucene.search.highlight.termvector.FieldFragList.WeightedFragInfo; + +/** + * An implementation of FragmentsBuilder that outputs score-order fragments. 
+ */ +public class ScoreOrderFragmentsBuilder extends BaseFragmentsBuilder { + + /** + * a constructor. + */ + public ScoreOrderFragmentsBuilder(){ + super(); + } + + /** + * a constructor. + */ + public ScoreOrderFragmentsBuilder(Formatter formatter){ + super(formatter); + } + + /** + * Sort by score the list of WeightedFragInfo + */ + @Override + public List getWeightedFragInfoList( List src ) { + Collections.sort( src, new ScoreComparator() ); + return src; + } + + public static class ScoreComparator implements Comparator { + + public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) { + if( o1.totalBoost > o2.totalBoost ) return -1; + else if( o1.totalBoost < o2.totalBoost ) return 1; + // if same score then check startOffset + else{ + if( o1.startOffset < o2.startOffset ) return -1; + else if( o1.startOffset > o2.startOffset ) return 1; + } + return 0; + } + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragListBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragListBuilder.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragListBuilder.java (revision 0) @@ -0,0 +1,84 @@ +package org.apache.lucene.search.highlight.termvector; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.search.highlight.termvector.FieldPhraseList.WeightedPhraseInfo; + +/** + * A simple implementation of FragListBuilder. + */ +public class SimpleFragListBuilder implements FragListBuilder { + + public static final int MARGIN = 6; + public static final int MIN_FRAG_CHAR_SIZE = MARGIN * 3; + + public FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) { + if( fragCharSize < MIN_FRAG_CHAR_SIZE ) + throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + + MIN_FRAG_CHAR_SIZE + " or higher." ); + + FieldFragList ffl = new FieldFragList( fragCharSize ); + + List wpil = new ArrayList(); + Iterator ite = fieldPhraseList.phraseList.iterator(); + WeightedPhraseInfo phraseInfo = null; + int startOffset = 0; + boolean taken = false; + while( true ){ + if( !taken ){ + if( !ite.hasNext() ) break; + phraseInfo = ite.next(); + } + taken = false; + if( phraseInfo == null ) break; + + // if the phrase violates the border of previous fragment, discard it and try next phrase + if( phraseInfo.getStartOffset() < startOffset ) continue; + + wpil.clear(); + wpil.add( phraseInfo ); + int st = phraseInfo.getStartOffset() - MARGIN < startOffset ? 
+ startOffset : phraseInfo.getStartOffset() - MARGIN; + int en = st + fragCharSize; + if( phraseInfo.getEndOffset() > en ) + en = phraseInfo.getEndOffset(); + startOffset = en; + + while( true ){ + if( ite.hasNext() ){ + phraseInfo = ite.next(); + taken = true; + if( phraseInfo == null ) break; + } + else + break; + if( phraseInfo.getEndOffset() <= en ) + wpil.add( phraseInfo ); + else + break; + } + ffl.add( st, en, wpil ); + } + return ffl; + } + +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragmentsBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragmentsBuilder.java (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragmentsBuilder.java (revision 0) @@ -0,0 +1,52 @@ +package org.apache.lucene.search.highlight.termvector; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.List; + +import org.apache.lucene.search.highlight.formatting.Formatter; +import org.apache.lucene.search.highlight.termvector.FieldFragList.WeightedFragInfo; + +/** + * A simple implementation of FragmentsBuilder. + * + */ +public class SimpleFragmentsBuilder extends BaseFragmentsBuilder { + + /** + * a constructor. + */ + public SimpleFragmentsBuilder() { + super(); + } + + /** + * a constructor. + */ + public SimpleFragmentsBuilder(Formatter formatter) { + super(formatter); + } + + /** + * do nothing. return the source list. + */ + @Override + public List getWeightedFragInfoList( List src ) { + return src; + } +} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/package.html =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/package.html (revision 0) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/package.html (revision 0) @@ -0,0 +1,143 @@ + + + + +This is another highlighter implementation. + +

Features

+
    +
  • fast for large docs
  • +
  • support N-gram fields
  • +
  • support phrase-unit highlighting with slops
  • +
  • need Java 1.5
  • +
  • highlight fields need to be TermVector.WITH_POSITIONS_OFFSETS
  • +
  • take into account query boost to score fragments
  • +
  • support colored highlight tags
  • +
  • pluggable FragListBuilder
  • +
  • pluggable FragmentsBuilder
  • +
+ +

Algorithm

+

To explain the algorithm, let's use the following sample text + (to be highlighted) and user query:

+ + + + + + + + + + +
Sample TextLucene is a search engine library.
User QueryLucene^2 OR "search library"~1
+ +

The user query is a BooleanQuery that consists of TermQuery("Lucene") +with boost of 2 and PhraseQuery("search library") with slop of 1.

+

For your convenience, here is the offset and position information for the +sample text.

+ +
++--------+-----------------------------------+
+|        |          1111111111222222222233333|
+|  offset|01234567890123456789012345678901234|
++--------+-----------------------------------+
+|document|Lucene is a search engine library. |
++--------*-----------------------------------+
+|position|0      1  2 3      4      5        |
++--------*-----------------------------------+
+
+ +

Step 1.

+

In Step 1, Fast Vector Highlighter generates {@link org.apache.lucene.search.highlight.termvector.FieldQuery.QueryPhraseMap} from the user query. +QueryPhraseMap consists of the following members:

+
+public class QueryPhraseMap {
+  boolean terminal;
+  int slop;   // valid if terminal == true and phraseHighlight == true
+  float boost;  // valid if terminal == true
+  Map<String, QueryPhraseMap> subMap;
+} 
+
+

QueryPhraseMap has subMap. The key of the subMap is a term +text in the user query and the value is a subsequent QueryPhraseMap. +If the query is a term (not phrase), then the subsequent QueryPhraseMap +is marked as terminal. If the query is a phrase, then the subsequent QueryPhraseMap +is not a terminal and it has the next term text in the phrase.

+ +

From the sample user query, the following QueryPhraseMap +will be generated:

+
+   QueryPhraseMap
++--------+-+  +-------+-+
+|"Lucene"|o+->|boost=2|*|  * : terminal
++--------+-+  +-------+-+
+
++--------+-+  +---------+-+  +-------+------+-+
+|"search"|o+->|"library"|o+->|boost=1|slop=1|*|
++--------+-+  +---------+-+  +-------+------+-+
+
+ +

Step 2.

+

In Step 2, Fast Vector Highlighter generates {@link org.apache.lucene.search.highlight.termvector.FieldTermStack}. Fast Vector Highlighter uses {@link org.apache.lucene.index.TermFreqVector} data +(must be stored {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS_OFFSETS}) +to generate it. FieldTermStack keeps the terms in the user query. +Therefore, in this sample case, Fast Vector Highlighter generates the following FieldTermStack:

+
+   FieldTermStack
++------------------+
+|"Lucene"(0,6,0)   |
++------------------+
+|"search"(12,18,3) |
++------------------+
+|"library"(26,33,5)|
++------------------+
+where : "termText"(startOffset,endOffset,position)
+
+

Step 3.

+

In Step 3, Fast Vector Highlighter generates {@link org.apache.lucene.search.highlight.termvector.FieldPhraseList} +by reference to QueryPhraseMap and FieldTermStack.

+
+   FieldPhraseList
++----------------+-----------------+---+
+|"Lucene"        |[(0,6)]          |w=2|
++----------------+-----------------+---+
+|"search library"|[(12,18),(26,33)]|w=1|
++----------------+-----------------+---+
+
+

The type of each entry is WeightedPhraseInfo that consists of +an array of term offsets and weight. The weight (Fast Vector Highlighter uses query boost to +calculate the weight) will be taken into account when Fast Vector Highlighter creates +{@link org.apache.lucene.search.highlight.termvector.FieldFragList} in the next step.

+

Step 4.

+

In Step 4, Fast Vector Highlighter creates FieldFragList by reference to +FieldPhraseList. In this sample case, the following +FieldFragList will be generated:

+
+   FieldFragList
++---------------------------------+
+|"Lucene"[(0,6)]                  |
+|"search library"[(12,18),(26,33)]|
+|totalBoost=3                     |
++---------------------------------+
+
+

Step 5.

+

In Step 5, by using FieldFragList and the field stored data, +Fast Vector Highlighter creates highlighted snippets!

+ + Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanGradientFormatter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanGradientFormatter.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanGradientFormatter.java (working copy) @@ -1,78 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Formats text with different color intensity depending on the score of the - * term using the span tag. GradientFormatter uses a bgcolor argument to the font tag which - * doesn't work in Mozilla, thus this class. 
- * - * @see GradientFormatter - * - */ - -public class SpanGradientFormatter - extends GradientFormatter -{ - public SpanGradientFormatter(float maxScore, String minForegroundColor, - String maxForegroundColor, String minBackgroundColor, - String maxBackgroundColor) - { - super( maxScore, minForegroundColor, - maxForegroundColor, minBackgroundColor, - maxBackgroundColor); - } - - - - @Override - public String highlightTerm(String originalText, TokenGroup tokenGroup) - { - if (tokenGroup.getTotalScore() == 0) - return originalText; - float score = tokenGroup.getTotalScore(); - if (score == 0) - { - return originalText; - } - - // try to size sb correctly - StringBuilder sb = new StringBuilder( originalText.length() + EXTRA); - - sb.append(""); - sb.append(originalText); - sb.append(""); - return sb.toString(); - } - - // guess how much extra text we'll add to the text we're highlighting to try to avoid a StringBuilder resize - private static final String TEMPLATE = "..."; - private static final int EXTRA = TEMPLATE.length(); -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (working copy) @@ -1,159 +0,0 @@ -package org.apache.lucene.search.vectorhighlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.Collections; -import java.util.LinkedList; -import java.util.Set; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.TermFreqVector; -import org.apache.lucene.index.TermPositionVector; -import org.apache.lucene.index.TermVectorOffsetInfo; - -/** - * FieldTermStack is a stack that keeps query terms in the specified field - * of the document to be highlighted. - */ -public class FieldTermStack { - - private final String fieldName; - LinkedList termList = new LinkedList(); - - //public static void main( String[] args ) throws Exception { - // Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT); - // QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "f", analyzer ); - // Query query = parser.parse( "a x:b" ); - // FieldQuery fieldQuery = new FieldQuery( query, true, false ); - - // Directory dir = new RAMDirectory(); - // IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)); - // Document doc = new Document(); - // doc.add( new Field( "f", "a a a b b c a b b c d e f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) ); - // doc.add( new Field( "f", "b a b a f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) ); - // writer.addDocument( doc ); - // writer.close(); - - // IndexReader reader = IndexReader.open( dir, true ); - // new FieldTermStack( reader, 0, "f", fieldQuery ); - // reader.close(); - //} - - /** - * a constructor. 
- * - * @param reader IndexReader of the index - * @param docId document id to be highlighted - * @param fieldName field of the document to be highlighted - * @param fieldQuery FieldQuery object - * @throws IOException - */ - public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException { - this.fieldName = fieldName; - - TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName ); - if( tfv == null ) return; // just return to make null snippets - TermPositionVector tpv = null; - try{ - tpv = (TermPositionVector)tfv; - } - catch( ClassCastException e ){ - return; // just return to make null snippets - } - - Set termSet = fieldQuery.getTermSet( fieldName ); - // just return to make null snippet if un-matched fieldName specified when fieldMatch == true - if( termSet == null ) return; - - for( String term : tpv.getTerms() ){ - if( !termSet.contains( term ) ) continue; - int index = tpv.indexOf( term ); - TermVectorOffsetInfo[] tvois = tpv.getOffsets( index ); - if( tvois == null ) return; // just return to make null snippets - int[] poss = tpv.getTermPositions( index ); - if( poss == null ) return; // just return to make null snippets - for( int i = 0; i < tvois.length; i++ ) - termList.add( new TermInfo( term, tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); - } - - // sort by position - Collections.sort( termList ); - } - - /** - * @return field name - */ - public String getFieldName(){ - return fieldName; - } - - /** - * @return the top TermInfo object of the stack - */ - public TermInfo pop(){ - return termList.poll(); - } - - /** - * @param termInfo the TermInfo object to be put on the top of the stack - */ - public void push( TermInfo termInfo ){ - // termList.push( termInfo ); // avoid Java 1.6 feature - termList.addFirst( termInfo ); - } - - /** - * to know whether the stack is empty - * - * @return true if the stack is empty, false if not - */ - public boolean isEmpty(){ - return 
termList == null || termList.size() == 0; - } - - public static class TermInfo implements Comparable{ - - final String text; - final int startOffset; - final int endOffset; - final int position; - - TermInfo( String text, int startOffset, int endOffset, int position ){ - this.text = text; - this.startOffset = startOffset; - this.endOffset = endOffset; - this.position = position; - } - - public String getText(){ return text; } - public int getStartOffset(){ return startOffset; } - public int getEndOffset(){ return endOffset; } - public int getPosition(){ return position; } - - @Override - public String toString(){ - StringBuilder sb = new StringBuilder(); - sb.append( text ).append( '(' ).append(startOffset).append( ',' ).append( endOffset ).append( ',' ).append( position ).append( ')' ); - return sb.toString(); - } - - public int compareTo( TermInfo o ) { - return ( this.position - o.position ); - } - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (working copy) @@ -1,399 +0,0 @@ -package org.apache.lucene.search.vectorhighlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.DisjunctionMaxQuery; -import org.apache.lucene.search.PhraseQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; - -/** - * FieldQuery breaks down query object into terms/phrases and keep - * them in QueryPhraseMap structure. 
- */ -public class FieldQuery { - - final boolean fieldMatch; - - // fieldMatch==true, Map - // fieldMatch==false, Map - Map rootMaps = new HashMap(); - - // fieldMatch==true, Map - // fieldMatch==false, Map - Map> termSetMap = new HashMap>(); - - int termOrPhraseNumber; // used for colored tag support - - FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ){ - this.fieldMatch = fieldMatch; - Set flatQueries = new HashSet(); - flatten( query, flatQueries ); - saveTerms( flatQueries ); - Collection expandQueries = expand( flatQueries ); - - for( Query flatQuery : expandQueries ){ - QueryPhraseMap rootMap = getRootMap( flatQuery ); - rootMap.add( flatQuery ); - if( !phraseHighlight && flatQuery instanceof PhraseQuery ){ - PhraseQuery pq = (PhraseQuery)flatQuery; - if( pq.getTerms().length > 1 ){ - for( Term term : pq.getTerms() ) - rootMap.addTerm( term, flatQuery.getBoost() ); - } - } - } - } - - void flatten( Query sourceQuery, Collection flatQueries ){ - if( sourceQuery instanceof BooleanQuery ){ - BooleanQuery bq = (BooleanQuery)sourceQuery; - for( BooleanClause clause : bq.getClauses() ){ - if( !clause.isProhibited() ) - flatten( clause.getQuery(), flatQueries ); - } - } - else if( sourceQuery instanceof DisjunctionMaxQuery ){ - DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery; - for( Query query : dmq ){ - flatten( query, flatQueries ); - } - } - else if( sourceQuery instanceof TermQuery ){ - if( !flatQueries.contains( sourceQuery ) ) - flatQueries.add( sourceQuery ); - } - else if( sourceQuery instanceof PhraseQuery ){ - if( !flatQueries.contains( sourceQuery ) ){ - PhraseQuery pq = (PhraseQuery)sourceQuery; - if( pq.getTerms().length > 1 ) - flatQueries.add( pq ); - else if( pq.getTerms().length == 1 ){ - flatQueries.add( new TermQuery( pq.getTerms()[0] ) ); - } - } - } - // else discard queries - } - - /* - * Create expandQueries from flatQueries. 
- * - * expandQueries := flatQueries + overlapped phrase queries - * - * ex1) flatQueries={a,b,c} - * => expandQueries={a,b,c} - * ex2) flatQueries={a,"b c","c d"} - * => expandQueries={a,"b c","c d","b c d"} - */ - Collection expand( Collection flatQueries ){ - Set expandQueries = new HashSet(); - for( Iterator i = flatQueries.iterator(); i.hasNext(); ){ - Query query = i.next(); - i.remove(); - expandQueries.add( query ); - if( !( query instanceof PhraseQuery ) ) continue; - for( Iterator j = flatQueries.iterator(); j.hasNext(); ){ - Query qj = j.next(); - if( !( qj instanceof PhraseQuery ) ) continue; - checkOverlap( expandQueries, (PhraseQuery)query, (PhraseQuery)qj ); - } - } - return expandQueries; - } - - /* - * Check if PhraseQuery A and B have overlapped part. - * - * ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"} - * ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"} - * ex3) A="a b", B="c d" => no overlap; expandQueries={} - */ - private void checkOverlap( Collection expandQueries, PhraseQuery a, PhraseQuery b ){ - if( a.getSlop() != b.getSlop() ) return; - Term[] ats = a.getTerms(); - Term[] bts = b.getTerms(); - if( fieldMatch && !ats[0].field().equals( bts[0].field() ) ) return; - checkOverlap( expandQueries, ats, bts, a.getSlop(), a.getBoost() ); - checkOverlap( expandQueries, bts, ats, b.getSlop(), b.getBoost() ); - } - - /* - * Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries. 
- * - * ex1) src="a b", dest="c d" => no overlap - * ex2) src="a b", dest="a b c" => no overlap - * ex3) src="a b", dest="b c" => overlap; expandQueries={"a b c"} - * ex4) src="a b c", dest="b c d" => overlap; expandQueries={"a b c d"} - * ex5) src="a b c", dest="b c" => no overlap - * ex6) src="a b c", dest="b" => no overlap - * ex7) src="a a a a", dest="a a a" => overlap; - * expandQueries={"a a a a a","a a a a a a"} - * ex8) src="a b c d", dest="b c" => no overlap - */ - private void checkOverlap( Collection expandQueries, Term[] src, Term[] dest, int slop, float boost ){ - // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms - // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten() - // converts PhraseQuery to TermQuery) - for( int i = 1; i < src.length; i++ ){ - boolean overlap = true; - for( int j = i; j < src.length; j++ ){ - if( ( j - i ) < dest.length && !src[j].text().equals( dest[j-i].text() ) ){ - overlap = false; - break; - } - } - if( overlap && src.length - i < dest.length ){ - PhraseQuery pq = new PhraseQuery(); - for( Term srcTerm : src ) - pq.add( srcTerm ); - for( int k = src.length - i; k < dest.length; k++ ){ - pq.add( new Term( src[0].field(), dest[k].text() ) ); - } - pq.setSlop( slop ); - pq.setBoost( boost ); - if(!expandQueries.contains( pq ) ) - expandQueries.add( pq ); - } - } - } - - QueryPhraseMap getRootMap( Query query ){ - String key = getKey( query ); - QueryPhraseMap map = rootMaps.get( key ); - if( map == null ){ - map = new QueryPhraseMap( this ); - rootMaps.put( key, map ); - } - return map; - } - - /* - * Return 'key' string. 'key' is the field name of the Query. - * If not fieldMatch, 'key' will be null. 
- */ - private String getKey( Query query ){ - if( !fieldMatch ) return null; - if( query instanceof TermQuery ) - return ((TermQuery)query).getTerm().field(); - else if ( query instanceof PhraseQuery ){ - PhraseQuery pq = (PhraseQuery)query; - Term[] terms = pq.getTerms(); - return terms[0].field(); - } - else - throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); - } - - /* - * Save the set of terms in the queries to termSetMap. - * - * ex1) q=name:john - * - fieldMatch==true - * termSetMap=Map<"name",Set<"john">> - * - fieldMatch==false - * termSetMap=Map> - * - * ex2) q=name:john title:manager - * - fieldMatch==true - * termSetMap=Map<"name",Set<"john">, - * "title",Set<"manager">> - * - fieldMatch==false - * termSetMap=Map> - * - * ex3) q=name:"john lennon" - * - fieldMatch==true - * termSetMap=Map<"name",Set<"john","lennon">> - * - fieldMatch==false - * termSetMap=Map> - */ - void saveTerms( Collection flatQueries ){ - for( Query query : flatQueries ){ - Set termSet = getTermSet( query ); - if( query instanceof TermQuery ) - termSet.add( ((TermQuery)query).getTerm().text() ); - else if( query instanceof PhraseQuery ){ - for( Term term : ((PhraseQuery)query).getTerms() ) - termSet.add( term.text() ); - } - else - throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); - } - } - - private Set getTermSet( Query query ){ - String key = getKey( query ); - Set set = termSetMap.get( key ); - if( set == null ){ - set = new HashSet(); - termSetMap.put( key, set ); - } - return set; - } - - Set getTermSet( String field ){ - return termSetMap.get( fieldMatch ? field : null ); - } - - /** - * - * @param fieldName - * @param term - * @return QueryPhraseMap - */ - public QueryPhraseMap getFieldTermMap( String fieldName, String term ){ - QueryPhraseMap rootMap = getRootMap( fieldName ); - return rootMap == null ? 
null : rootMap.subMap.get( term ); - } - - /** - * - * @param fieldName - * @param phraseCandidate - * @return QueryPhraseMap - */ - public QueryPhraseMap searchPhrase( String fieldName, final List phraseCandidate ){ - QueryPhraseMap root = getRootMap( fieldName ); - if( root == null ) return null; - return root.searchPhrase( phraseCandidate ); - } - - private QueryPhraseMap getRootMap( String fieldName ){ - return rootMaps.get( fieldMatch ? fieldName : null ); - } - - int nextTermOrPhraseNumber(){ - return termOrPhraseNumber++; - } - - public static class QueryPhraseMap { - - boolean terminal; - int slop; // valid if terminal == true and phraseHighlight == true - float boost; // valid if terminal == true - int termOrPhraseNumber; // valid if terminal == true - FieldQuery fieldQuery; - Map subMap = new HashMap(); - - public QueryPhraseMap( FieldQuery fieldQuery ){ - this.fieldQuery = fieldQuery; - } - - void addTerm( Term term, float boost ){ - QueryPhraseMap map = getOrNewMap( subMap, term.text() ); - map.markTerminal( boost ); - } - - private QueryPhraseMap getOrNewMap( Map subMap, String term ){ - QueryPhraseMap map = subMap.get( term ); - if( map == null ){ - map = new QueryPhraseMap( fieldQuery ); - subMap.put( term, map ); - } - return map; - } - - void add( Query query ){ - if( query instanceof TermQuery ){ - addTerm( ((TermQuery)query).getTerm(), query.getBoost() ); - } - else if( query instanceof PhraseQuery ){ - PhraseQuery pq = (PhraseQuery)query; - Term[] terms = pq.getTerms(); - Map map = subMap; - QueryPhraseMap qpm = null; - for( Term term : terms ){ - qpm = getOrNewMap( map, term.text() ); - map = qpm.subMap; - } - qpm.markTerminal( pq.getSlop(), pq.getBoost() ); - } - else - throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." 
); - } - - public QueryPhraseMap getTermMap( String term ){ - return subMap.get( term ); - } - - private void markTerminal( float boost ){ - markTerminal( 0, boost ); - } - - private void markTerminal( int slop, float boost ){ - this.terminal = true; - this.slop = slop; - this.boost = boost; - this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber(); - } - - public boolean isTerminal(){ - return terminal; - } - - public int getSlop(){ - return slop; - } - - public float getBoost(){ - return boost; - } - - public int getTermOrPhraseNumber(){ - return termOrPhraseNumber; - } - - public QueryPhraseMap searchPhrase( final List phraseCandidate ){ - QueryPhraseMap currMap = this; - for( TermInfo ti : phraseCandidate ){ - currMap = currMap.subMap.get( ti.getText() ); - if( currMap == null ) return null; - } - return currMap.isValidTermOrPhrase( phraseCandidate ) ? currMap : null; - } - - public boolean isValidTermOrPhrase( final List phraseCandidate ){ - // check terminal - if( !terminal ) return false; - - // if the candidate is a term, it is valid - if( phraseCandidate.size() == 1 ) return true; - - // else check whether the candidate is valid phrase - // compare position-gaps between terms to slop - int pos = phraseCandidate.get( 0 ).getPosition(); - for( int i = 1; i < phraseCandidate.size(); i++ ){ - int nextPos = phraseCandidate.get( i ).getPosition(); - if( Math.abs( nextPos - pos - 1 ) > slop ) return false; - pos = nextPos; - } - return true; - } - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (working copy) @@ -1,154 +0,0 @@ -package org.apache.lucene.search.vectorhighlight; - -/** - * Licensed 
to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.MapFieldSelector; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; -import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs; - -public abstract class BaseFragmentsBuilder implements FragmentsBuilder { - - protected String[] preTags, postTags; - public static final String[] COLORED_PRE_TAGS = { - "", "", "", - "", "", "", - "", "", "", - "" - }; - public static final String[] COLORED_POST_TAGS = { "" }; - - protected BaseFragmentsBuilder(){ - this( new String[]{ "" }, new String[]{ "" } ); - } - - protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){ - this.preTags = preTags; - this.postTags = postTags; - } - - static Object checkTagsArgument( Object tags ){ - if( tags instanceof String ) return tags; - else if( tags instanceof String[] ) return tags; - throw new 
IllegalArgumentException( "type of preTags/postTags must be a String or String[]" ); - } - - public abstract List getWeightedFragInfoList( List src ); - - public String createFragment( IndexReader reader, int docId, - String fieldName, FieldFragList fieldFragList ) throws IOException { - String[] fragments = createFragments( reader, docId, fieldName, fieldFragList, 1 ); - if( fragments == null || fragments.length == 0 ) return null; - return fragments[0]; - } - - public String[] createFragments( IndexReader reader, int docId, - String fieldName, FieldFragList fieldFragList, int maxNumFragments ) - throws IOException { - if( maxNumFragments < 0 ) - throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." ); - - List fragInfos = getWeightedFragInfoList( fieldFragList.fragInfos ); - - List fragments = new ArrayList( maxNumFragments ); - Field[] values = getFields( reader, docId, fieldName ); - if( values.length == 0 ) return null; - StringBuilder buffer = new StringBuilder(); - int[] nextValueIndex = { 0 }; - for( int n = 0; n < maxNumFragments && n < fragInfos.size(); n++ ){ - WeightedFragInfo fragInfo = fragInfos.get( n ); - fragments.add( makeFragment( buffer, nextValueIndex, values, fragInfo ) ); - } - return fragments.toArray( new String[fragments.size()] ); - } - - @Deprecated - protected String[] getFieldValues( IndexReader reader, int docId, String fieldName) throws IOException { - Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) ); - return doc.getValues( fieldName ); // according to Document class javadoc, this never returns null - } - - protected Field[] getFields( IndexReader reader, int docId, String fieldName) throws IOException { - // according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field??? 
- Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) ); - return doc.getFields( fieldName ); // according to Document class javadoc, this never returns null - } - - @Deprecated - protected String makeFragment( StringBuilder buffer, int[] index, String[] values, WeightedFragInfo fragInfo ){ - final int s = fragInfo.startOffset; - return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s ); - } - - protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo ){ - final int s = fragInfo.startOffset; - return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s ); - } - - private String makeFragment( WeightedFragInfo fragInfo, String src, int s ){ - StringBuilder fragment = new StringBuilder(); - int srcIndex = 0; - for( SubInfo subInfo : fragInfo.subInfos ){ - for( Toffs to : subInfo.termsOffsets ){ - fragment.append( src.substring( srcIndex, to.startOffset - s ) ).append( getPreTag( subInfo.seqnum ) ) - .append( src.substring( to.startOffset - s, to.endOffset - s ) ).append( getPostTag( subInfo.seqnum ) ); - srcIndex = to.endOffset - s; - } - } - fragment.append( src.substring( srcIndex ) ); - return fragment.toString(); - } - - @Deprecated - protected String getFragmentSource( StringBuilder buffer, int[] index, String[] values, - int startOffset, int endOffset ){ - while( buffer.length() < endOffset && index[0] < values.length ){ - if( index[0] > 0 && values[index[0]].length() > 0 ) - buffer.append( ' ' ); - buffer.append( values[index[0]++] ); - } - int eo = buffer.length() < endOffset ? 
buffer.length() : endOffset; - return buffer.substring( startOffset, eo ); - } - - protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values, - int startOffset, int endOffset ){ - while( buffer.length() < endOffset && index[0] < values.length ){ - if( index[0] > 0 && values[index[0]].isTokenized() && values[index[0]].stringValue().length() > 0 ) - buffer.append( ' ' ); - buffer.append( values[index[0]++].stringValue() ); - } - int eo = buffer.length() < endOffset ? buffer.length() : endOffset; - return buffer.substring( startOffset, eo ); - } - - protected String getPreTag( int num ){ - return preTags.length > num ? preTags[num] : preTags[0]; - } - - protected String getPostTag( int num ){ - return postTags.length > num ? postTags[num] : postTags[0]; - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java (working copy) @@ -1,70 +0,0 @@ -package org.apache.lucene.search.vectorhighlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Collections; -import java.util.Comparator; -import java.util.List; - -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; - -/** - * An implementation of FragmentsBuilder that outputs score-order fragments. - */ -public class ScoreOrderFragmentsBuilder extends BaseFragmentsBuilder { - - /** - * a constructor. - */ - public ScoreOrderFragmentsBuilder(){ - super(); - } - - /** - * a constructor. - * - * @param preTags array of pre-tags for markup terms. - * @param postTags array of post-tags for markup terms. - */ - public ScoreOrderFragmentsBuilder( String[] preTags, String[] postTags ){ - super( preTags, postTags ); - } - - /** - * Sort by score the list of WeightedFragInfo - */ - @Override - public List getWeightedFragInfoList( List src ) { - Collections.sort( src, new ScoreComparator() ); - return src; - } - - public static class ScoreComparator implements Comparator { - - public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) { - if( o1.totalBoost > o2.totalBoost ) return -1; - else if( o1.totalBoost < o2.totalBoost ) return 1; - // if same score then check startOffset - else{ - if( o1.startOffset < o2.startOffset ) return -1; - else if( o1.startOffset > o2.startOffset ) return 1; - } - return 0; - } - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html (revision 956773) +++ 
lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html (working copy) @@ -1,143 +0,0 @@ - - - - -This is an another highlighter implementation. - -

Features

-
    -
  • fast for large docs
  • -
  • support N-gram fields
  • -
  • support phrase-unit highlighting with slops
  • -
  • need Java 1.5
  • -
  • highlight fields need to be TermVector.WITH_POSITIONS_OFFSETS
  • -
  • take into account query boost to score fragments
  • -
  • support colored highlight tags
  • -
  • pluggable FragListBuilder
  • -
  • pluggable FragmentsBuilder
  • -
- -

Algorithm

-

To explain the algorithm, let's use the following sample text - (to be highlighted) and user query:

- - - - - - - - - - -
Sample TextLucene is a search engine library.
User QueryLucene^2 OR "search library"~1
- -

The user query is a BooleanQuery that consists of TermQuery("Lucene") -with boost of 2 and PhraseQuery("search library") with slop of 1.

-

For your convenience, here is the offsets and positions info of the -sample text.

- -
-+--------+-----------------------------------+
-|        |          1111111111222222222233333|
-|  offset|01234567890123456789012345678901234|
-+--------+-----------------------------------+
-|document|Lucene is a search engine library. |
-+--------*-----------------------------------+
-|position|0      1  2 3      4      5        |
-+--------*-----------------------------------+
-
- -

Step 1.

-

In Step 1, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap} from the user query. -QueryPhraseMap consists of the following members:

-
-public class QueryPhraseMap {
-  boolean terminal;
-  int slop;   // valid if terminal == true and phraseHighlight == true
-  float boost;  // valid if terminal == true
-  Map<String, QueryPhraseMap> subMap;
-} 
-
-

QueryPhraseMap has subMap. The key of the subMap is a term -text in the user query and the value is a subsequent QueryPhraseMap. -If the query is a term (not phrase), then the subsequent QueryPhraseMap -is marked as terminal. If the query is a phrase, then the subsequent QueryPhraseMap -is not a terminal and it has the next term text in the phrase.

- -

From the sample user query, the following QueryPhraseMap -will be generated:

-
-   QueryPhraseMap
-+--------+-+  +-------+-+
-|"Lucene"|o+->|boost=2|*|  * : terminal
-+--------+-+  +-------+-+
-
-+--------+-+  +---------+-+  +-------+------+-+
-|"search"|o+->|"library"|o+->|boost=1|slop=1|*|
-+--------+-+  +---------+-+  +-------+------+-+
-
- -

Step 2.

-

In Step 2, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldTermStack}. Fast Vector Highlighter uses {@link org.apache.lucene.index.TermFreqVector} data -(must be stored {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS_OFFSETS}) -to generate it. FieldTermStack keeps the terms in the user query. -Therefore, in this sample case, Fast Vector Highlighter generates the following FieldTermStack:

-
-   FieldTermStack
-+------------------+
-|"Lucene"(0,6,0)   |
-+------------------+
-|"search"(12,18,3) |
-+------------------+
-|"library"(26,33,5)|
-+------------------+
-where : "termText"(startOffset,endOffset,position)
-
-

Step 3.

-

In Step 3, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldPhraseList} -by reference to QueryPhraseMap and FieldTermStack.

-
-   FieldPhraseList
-+----------------+-----------------+---+
-|"Lucene"        |[(0,6)]          |w=2|
-+----------------+-----------------+---+
-|"search library"|[(12,18),(26,33)]|w=1|
-+----------------+-----------------+---+
-
-

The type of each entry is WeightedPhraseInfo that consists of -an array of terms offsets and weight. The weight (Fast Vector Highlighter uses query boost to -calculate the weight) will be taken into account when Fast Vector Highlighter creates -{@link org.apache.lucene.search.vectorhighlight.FieldFragList} in the next step.

-

Step 4.

-

In Step 4, Fast Vector Highlighter creates FieldFragList by reference to -FieldPhraseList. In this sample case, the following -FieldFragList will be generated:

-
-   FieldFragList
-+---------------------------------+
-|"Lucene"[(0,6)]                  |
-|"search library"[(12,18),(26,33)]|
-|totalBoost=3                     |
-+---------------------------------+
-
-

Step 5.

-

In Step 5, by using FieldFragList and the field stored data, -Fast Vector Highlighter creates highlighted snippets!

- - Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java (working copy) @@ -1,54 +0,0 @@ -package org.apache.lucene.search.vectorhighlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.List; - -import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; - -/** - * A simple implementation of FragmentsBuilder. - * - */ -public class SimpleFragmentsBuilder extends BaseFragmentsBuilder { - - /** - * a constructor. - */ - public SimpleFragmentsBuilder() { - super(); - } - - /** - * a constructor. - * - * @param preTags array of pre-tags for markup terms. - * @param postTags array of post-tags for markup terms. - */ - public SimpleFragmentsBuilder( String[] preTags, String[] postTags ) { - super( preTags, postTags ); - } - - /** - * do nothing. return the source list. 
- */ - @Override - public List getWeightedFragInfoList( List src ) { - return src; - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java (working copy) @@ -1,137 +0,0 @@ -package org.apache.lucene.search.vectorhighlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.Query; - -/** - * Another highlighter implementation. - * - */ -public class FastVectorHighlighter { - - public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true; - public static final boolean DEFAULT_FIELD_MATCH = true; - private final boolean phraseHighlight; - private final boolean fieldMatch; - private final FragListBuilder fragListBuilder; - private final FragmentsBuilder fragmentsBuilder; - - /** - * the default constructor. 
- */ - public FastVectorHighlighter(){ - this( DEFAULT_PHRASE_HIGHLIGHT, DEFAULT_FIELD_MATCH ); - } - - /** - * a constructor. Using SimpleFragListBuilder and ScoreOrderFragmentsBuilder. - * - * @param phraseHighlight true or false for phrase highlighting - * @param fieldMatch true of false for field matching - */ - public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch ){ - this( phraseHighlight, fieldMatch, new SimpleFragListBuilder(), new ScoreOrderFragmentsBuilder() ); - } - - /** - * a constructor. A FragListBuilder and a FragmentsBuilder can be specified (plugins). - * - * @param phraseHighlight true of false for phrase highlighting - * @param fieldMatch true of false for field matching - * @param fragListBuilder an instance of FragListBuilder - * @param fragmentsBuilder an instance of FragmentsBuilder - */ - public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch, - FragListBuilder fragListBuilder, FragmentsBuilder fragmentsBuilder ){ - this.phraseHighlight = phraseHighlight; - this.fieldMatch = fieldMatch; - this.fragListBuilder = fragListBuilder; - this.fragmentsBuilder = fragmentsBuilder; - } - - /** - * create a FieldQuery object. - * - * @param query a query - * @return the created FieldQuery object - */ - public FieldQuery getFieldQuery( Query query ){ - return new FieldQuery( query, phraseHighlight, fieldMatch ); - } - - /** - * return the best fragment. 
- * - * @param fieldQuery FieldQuery object - * @param reader IndexReader of the index - * @param docId document id to be highlighted - * @param fieldName field of the document to be highlighted - * @param fragCharSize the length (number of chars) of a fragment - * @return the best fragment (snippet) string - * @throws IOException - */ - public final String getBestFragment( final FieldQuery fieldQuery, IndexReader reader, int docId, - String fieldName, int fragCharSize ) throws IOException { - FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize ); - return fragmentsBuilder.createFragment( reader, docId, fieldName, fieldFragList ); - } - - /** - * return the best fragments. - * - * @param fieldQuery FieldQuery object - * @param reader IndexReader of the index - * @param docId document id to be highlighted - * @param fieldName field of the document to be highlighted - * @param fragCharSize the length (number of chars) of a fragment - * @param maxNumFragments maximum number of fragments - * @return created fragments or null when no fragments created. 
- * size of the array can be less than maxNumFragments - * @throws IOException - */ - public final String[] getBestFragments( final FieldQuery fieldQuery, IndexReader reader, int docId, - String fieldName, int fragCharSize, int maxNumFragments ) throws IOException { - FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize ); - return fragmentsBuilder.createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments ); - } - - private FieldFragList getFieldFragList( final FieldQuery fieldQuery, IndexReader reader, int docId, - String fieldName, int fragCharSize ) throws IOException { - FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, fieldName, fieldQuery ); - FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery ); - return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize ); - } - - /** - * return whether phraseHighlight or not. - * - * @return whether phraseHighlight or not - */ - public boolean isPhraseHighlight(){ return phraseHighlight; } - - /** - * return whether fieldMatch or not. - * - * @return whether fieldMatch or not - */ - public boolean isFieldMatch(){ return fieldMatch; } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java (working copy) @@ -1,34 +0,0 @@ -package org.apache.lucene.search.vectorhighlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * FragListBuilder is an interface for FieldFragList builder classes. - * A FragListBuilder class can be plugged in to Highlighter. - */ -public interface FragListBuilder { - - /** - * create a FieldFragList. - * - * @param fieldPhraseList FieldPhraseList object - * @param fragCharSize the length (number of chars) of a fragment - * @return the created FieldFragList object - */ - public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ); -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (working copy) @@ -1,191 +0,0 @@ -package org.apache.lucene.search.vectorhighlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; - -import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap; -import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; - -/** - * FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder - * to create a FieldFragList object. - */ -public class FieldPhraseList { - - LinkedList phraseList = new LinkedList(); - - /** - * a constructor. - * - * @param fieldTermStack FieldTermStack object - * @param fieldQuery FieldQuery object - */ - public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery ){ - final String field = fieldTermStack.getFieldName(); - - LinkedList phraseCandidate = new LinkedList(); - QueryPhraseMap currMap = null; - QueryPhraseMap nextMap = null; - while( !fieldTermStack.isEmpty() ){ - - phraseCandidate.clear(); - - TermInfo ti = fieldTermStack.pop(); - currMap = fieldQuery.getFieldTermMap( field, ti.getText() ); - - // if not found, discard top TermInfo from stack, then try next element - if( currMap == null ) continue; - - // if found, search the longest phrase - phraseCandidate.add( ti ); - while( true ){ - ti = fieldTermStack.pop(); - nextMap = null; - if( ti != null ) - nextMap = currMap.getTermMap( ti.getText() ); - if( ti == null || nextMap == null ){ - if( ti != null ) - fieldTermStack.push( ti ); - if( currMap.isValidTermOrPhrase( phraseCandidate ) ){ - addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); - } 
- else{ - while( phraseCandidate.size() > 1 ){ - fieldTermStack.push( phraseCandidate.removeLast() ); - currMap = fieldQuery.searchPhrase( field, phraseCandidate ); - if( currMap != null ){ - addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); - break; - } - } - } - break; - } - else{ - phraseCandidate.add( ti ); - currMap = nextMap; - } - } - } - } - - void addIfNoOverlap( WeightedPhraseInfo wpi ){ - for( WeightedPhraseInfo existWpi : phraseList ){ - if( existWpi.isOffsetOverlap( wpi ) ) return; - } - phraseList.add( wpi ); - } - - public static class WeightedPhraseInfo { - - String text; // unnecessary member, just exists for debugging purpose - List termsOffsets; // usually termsOffsets.size() == 1, - // but if position-gap > 1 and slop > 0 then size() could be greater than 1 - float boost; // query boost - int seqnum; - - public WeightedPhraseInfo( LinkedList terms, float boost ){ - this( terms, boost, 0 ); - } - - public WeightedPhraseInfo( LinkedList terms, float boost, int number ){ - this.boost = boost; - this.seqnum = number; - termsOffsets = new ArrayList( terms.size() ); - TermInfo ti = terms.get( 0 ); - termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) ); - if( terms.size() == 1 ){ - text = ti.getText(); - return; - } - StringBuilder sb = new StringBuilder(); - sb.append( ti.getText() ); - int pos = ti.getPosition(); - for( int i = 1; i < terms.size(); i++ ){ - ti = terms.get( i ); - sb.append( ti.getText() ); - if( ti.getPosition() - pos == 1 ){ - Toffs to = termsOffsets.get( termsOffsets.size() - 1 ); - to.setEndOffset( ti.getEndOffset() ); - } - else{ - termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) ); - } - pos = ti.getPosition(); - } - text = sb.toString(); - } - - public int getStartOffset(){ - return termsOffsets.get( 0 ).startOffset; - } - - public int getEndOffset(){ - return termsOffsets.get( termsOffsets.size() - 1 ).endOffset; - } - - 
public boolean isOffsetOverlap( WeightedPhraseInfo other ){ - int so = getStartOffset(); - int eo = getEndOffset(); - int oso = other.getStartOffset(); - int oeo = other.getEndOffset(); - if( so <= oso && oso < eo ) return true; - if( so < oeo && oeo <= eo ) return true; - if( oso <= so && so < oeo ) return true; - if( oso < eo && eo <= oeo ) return true; - return false; - } - - @Override - public String toString(){ - StringBuilder sb = new StringBuilder(); - sb.append( text ).append( '(' ).append( boost ).append( ")(" ); - for( Toffs to : termsOffsets ){ - sb.append( to ); - } - sb.append( ')' ); - return sb.toString(); - } - - public static class Toffs { - int startOffset; - int endOffset; - public Toffs( int startOffset, int endOffset ){ - this.startOffset = startOffset; - this.endOffset = endOffset; - } - public void setEndOffset( int endOffset ){ - this.endOffset = endOffset; - } - public int getStartOffset(){ - return startOffset; - } - public int getEndOffset(){ - return endOffset; - } - @Override - public String toString(){ - StringBuilder sb = new StringBuilder(); - sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); - return sb.toString(); - } - } - } -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java (working copy) @@ -1,57 +0,0 @@ -package org.apache.lucene.search.vectorhighlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.index.IndexReader; - -/** - * FragmentsBuilder is an interface for fragments (snippets) builder classes. - * A FragmentsBuilder class can be plugged in to Highlighter. - */ -public interface FragmentsBuilder { - - /** - * create a fragment. - * - * @param reader IndexReader of the index - * @param docId document id to be highlighted - * @param fieldName field of the document to be highlighted - * @param fieldFragList FieldFragList object - * @return a created fragment or null when no fragment created - * @throws IOException - */ - public String createFragment( IndexReader reader, int docId, String fieldName, - FieldFragList fieldFragList ) throws IOException; - - /** - * create multiple fragments. - * - * @param reader IndexReader of the index - * @param docId document id to be highlighter - * @param fieldName field of the document to be highlighted - * @param fieldFragList FieldFragList object - * @param maxNumFragments maximum number of fragments - * @return created fragments or null when no fragments created. 
- * size of the array can be less than maxNumFragments - * @throws IOException - */ - public String[] createFragments( IndexReader reader, int docId, String fieldName, - FieldFragList fieldFragList, int maxNumFragments ) throws IOException; -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java (working copy) @@ -1,84 +0,0 @@ -package org.apache.lucene.search.vectorhighlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo; - -/** - * A simple implementation of FragListBuilder. 
- */ -public class SimpleFragListBuilder implements FragListBuilder { - - public static final int MARGIN = 6; - public static final int MIN_FRAG_CHAR_SIZE = MARGIN * 3; - - public FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) { - if( fragCharSize < MIN_FRAG_CHAR_SIZE ) - throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + - MIN_FRAG_CHAR_SIZE + " or higher." ); - - FieldFragList ffl = new FieldFragList( fragCharSize ); - - List wpil = new ArrayList(); - Iterator ite = fieldPhraseList.phraseList.iterator(); - WeightedPhraseInfo phraseInfo = null; - int startOffset = 0; - boolean taken = false; - while( true ){ - if( !taken ){ - if( !ite.hasNext() ) break; - phraseInfo = ite.next(); - } - taken = false; - if( phraseInfo == null ) break; - - // if the phrase violates the border of previous fragment, discard it and try next phrase - if( phraseInfo.getStartOffset() < startOffset ) continue; - - wpil.clear(); - wpil.add( phraseInfo ); - int st = phraseInfo.getStartOffset() - MARGIN < startOffset ? 
- startOffset : phraseInfo.getStartOffset() - MARGIN; - int en = st + fragCharSize; - if( phraseInfo.getEndOffset() > en ) - en = phraseInfo.getEndOffset(); - startOffset = en; - - while( true ){ - if( ite.hasNext() ){ - phraseInfo = ite.next(); - taken = true; - if( phraseInfo == null ) break; - } - else - break; - if( phraseInfo.getEndOffset() <= en ) - wpil.add( phraseInfo ); - else - break; - } - ffl.add( st, en, wpil ); - } - return ffl; - } - -} Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java (revision 956773) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java (working copy) @@ -1,128 +0,0 @@ -package org.apache.lucene.search.vectorhighlight; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.util.ArrayList; -import java.util.List; - -import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo; -import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs; - -/** - * FieldFragList has a list of "frag info" that is used by FragmentsBuilder class - * to create fragments (snippets). - */ -public class FieldFragList { - - List fragInfos = new ArrayList(); - - /** - * a constructor. - * - * @param fragCharSize the length (number of chars) of a fragment - */ - public FieldFragList( int fragCharSize ){ - } - - /** - * convert the list of WeightedPhraseInfo to WeightedFragInfo, then add it to the fragInfos - * - * @param startOffset start offset of the fragment - * @param endOffset end offset of the fragment - * @param phraseInfoList list of WeightedPhraseInfo objects - */ - public void add( int startOffset, int endOffset, List phraseInfoList ){ - fragInfos.add( new WeightedFragInfo( startOffset, endOffset, phraseInfoList ) ); - } - - public static class WeightedFragInfo { - - List subInfos; - float totalBoost; - int startOffset; - int endOffset; - - public WeightedFragInfo( int startOffset, int endOffset, List phraseInfoList ){ - this.startOffset = startOffset; - this.endOffset = endOffset; - subInfos = new ArrayList(); - for( WeightedPhraseInfo phraseInfo : phraseInfoList ){ - SubInfo subInfo = new SubInfo( phraseInfo.text, phraseInfo.termsOffsets, phraseInfo.seqnum ); - subInfos.add( subInfo ); - totalBoost += phraseInfo.boost; - } - } - - public List getSubInfos(){ - return subInfos; - } - - public float getTotalBoost(){ - return totalBoost; - } - - public int getStartOffset(){ - return startOffset; - } - - public int getEndOffset(){ - return endOffset; - } - - @Override - public String toString(){ - StringBuilder sb = new StringBuilder(); - sb.append( "subInfos=(" ); - for( SubInfo si : subInfos ) - sb.append( si.toString() ); - sb.append( ")/" ).append( totalBoost ).append( '(' 
).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); - return sb.toString(); - } - - public static class SubInfo { - final String text; // unnecessary member, just exists for debugging purpose - final List termsOffsets; // usually termsOffsets.size() == 1, - // but if position-gap > 1 and slop > 0 then size() could be greater than 1 - int seqnum; - - SubInfo( String text, List termsOffsets, int seqnum ){ - this.text = text; - this.termsOffsets = termsOffsets; - this.seqnum = seqnum; - } - - public List getTermsOffsets(){ - return termsOffsets; - } - - public int getSeqnum(){ - return seqnum; - } - - @Override - public String toString(){ - StringBuilder sb = new StringBuilder(); - sb.append( text ).append( '(' ); - for( Toffs to : termsOffsets ) - sb.append( to.toString() ); - sb.append( ')' ); - return sb.toString(); - } - } - } -}