();
- query.extractTerms(nonWeightedTerms);
-
- for (final Term queryTerm : nonWeightedTerms) {
-
- if (fieldNameComparator(queryTerm.field())) {
- WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.getBoost(), queryTerm.text());
- terms.put(queryTerm.text(), weightedSpanTerm);
- }
- }
- }
-
- /**
- * Necessary to implement matches for queries against defaultField
- */
- private boolean fieldNameComparator(String fieldNameToCheck) {
- boolean rv = fieldName == null || fieldNameToCheck == fieldName
- || fieldNameToCheck == defaultField;
- return rv;
- }
-
- private IndexReader getReaderForField(String field) throws IOException {
- if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) {
- tokenStream = new CachingTokenFilter(tokenStream);
- cachedTokenStream = true;
- }
- IndexReader reader = readers.get(field);
- if (reader == null) {
- MemoryIndex indexer = new MemoryIndex();
- indexer.addField(field, tokenStream);
- tokenStream.reset();
- IndexSearcher searcher = indexer.createSearcher();
- reader = searcher.getIndexReader();
- readers.put(field, reader);
- }
-
- return reader;
- }
-
- /**
- * Creates a Map of WeightedSpanTerms from the given Query and TokenStream.
- *
- *
- *
- * @param query
- * that caused hit
- * @param tokenStream
- * of text to be highlighted
- * @return Map containing WeightedSpanTerms
- * @throws IOException
- */
- public Map getWeightedSpanTerms(Query query, TokenStream tokenStream)
- throws IOException {
- return getWeightedSpanTerms(query, tokenStream, null);
- }
-
- /**
- * Creates a Map of WeightedSpanTerms from the given Query and TokenStream.
- *
- *
- *
- * @param query
- * that caused hit
- * @param tokenStream
- * of text to be highlighted
- * @param fieldName
- * restricts Term's used based on field name
- * @return Map containing WeightedSpanTerms
- * @throws IOException
- */
- public Map getWeightedSpanTerms(Query query, TokenStream tokenStream,
- String fieldName) throws IOException {
- if (fieldName != null) {
- this.fieldName = StringHelper.intern(fieldName);
- } else {
- this.fieldName = null;
- }
-
- Map terms = new PositionCheckingMap();
- this.tokenStream = tokenStream;
- try {
- extract(query, terms);
- } finally {
- closeReaders();
- }
-
- return terms;
- }
-
- /**
- * Creates a Map of WeightedSpanTerms from the given Query and TokenStream. Uses a supplied
- * IndexReader to properly weight terms (for gradient highlighting).
- *
- *
- *
- * @param query
- * that caused hit
- * @param tokenStream
- * of text to be highlighted
- * @param fieldName
- * restricts Term's used based on field name
- * @param reader
- * to use for scoring
- * @return Map of WeightedSpanTerms with quasi tf/idf scores
- * @throws IOException
- */
- public Map getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream, String fieldName,
- IndexReader reader) throws IOException {
- if (fieldName != null) {
- this.fieldName = StringHelper.intern(fieldName);
- } else {
- this.fieldName = null;
- }
- this.tokenStream = tokenStream;
-
- Map terms = new PositionCheckingMap();
- extract(query, terms);
-
- int totalNumDocs = reader.numDocs();
- Set weightedTerms = terms.keySet();
- Iterator it = weightedTerms.iterator();
-
- try {
- while (it.hasNext()) {
- WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
- int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
- // docFreq counts deletes
- if(totalNumDocs < docFreq) {
- docFreq = totalNumDocs;
- }
- // IDF algorithm taken from DefaultSimilarity class
- float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
- weightedSpanTerm.weight *= idf;
- }
- } finally {
-
- closeReaders();
- }
-
- return terms;
- }
-
- private void collectSpanQueryFields(SpanQuery spanQuery, Set fieldNames) {
- if (spanQuery instanceof FieldMaskingSpanQuery) {
- collectSpanQueryFields(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery(), fieldNames);
- } else if (spanQuery instanceof SpanFirstQuery) {
- collectSpanQueryFields(((SpanFirstQuery)spanQuery).getMatch(), fieldNames);
- } else if (spanQuery instanceof SpanNearQuery) {
- for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) {
- collectSpanQueryFields(clause, fieldNames);
- }
- } else if (spanQuery instanceof SpanNotQuery) {
- collectSpanQueryFields(((SpanNotQuery)spanQuery).getInclude(), fieldNames);
- } else if (spanQuery instanceof SpanOrQuery) {
- for (final SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) {
- collectSpanQueryFields(clause, fieldNames);
- }
- } else {
- fieldNames.add(spanQuery.getField());
- }
- }
-
- private boolean mustRewriteQuery(SpanQuery spanQuery) {
- if (!expandMultiTermQuery) {
- return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery.
- } else if (spanQuery instanceof FieldMaskingSpanQuery) {
- return mustRewriteQuery(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery());
- } else if (spanQuery instanceof SpanFirstQuery) {
- return mustRewriteQuery(((SpanFirstQuery)spanQuery).getMatch());
- } else if (spanQuery instanceof SpanNearQuery) {
- for (final SpanQuery clause : ((SpanNearQuery)spanQuery).getClauses()) {
- if (mustRewriteQuery(clause)) {
- return true;
- }
- }
- return false;
- } else if (spanQuery instanceof SpanNotQuery) {
- SpanNotQuery spanNotQuery = (SpanNotQuery)spanQuery;
- return mustRewriteQuery(spanNotQuery.getInclude()) || mustRewriteQuery(spanNotQuery.getExclude());
- } else if (spanQuery instanceof SpanOrQuery) {
- for (final SpanQuery clause : ((SpanOrQuery)spanQuery).getClauses()) {
- if (mustRewriteQuery(clause)) {
- return true;
- }
- }
- return false;
- } else if (spanQuery instanceof SpanTermQuery) {
- return false;
- } else {
- return true;
- }
- }
-
- /**
- * This class makes sure that if both position sensitive and insensitive
- * versions of the same term are added, the position insensitive one wins.
- */
- static private class PositionCheckingMap extends HashMap {
-
- @Override
- public void putAll(Map m) {
- Iterator> it = m.entrySet().iterator();
- while (it.hasNext()) {
- Map.Entry entry = it.next();
- this.put(entry.getKey(), entry.getValue());
- }
- }
-
- @Override
- public WeightedSpanTerm put(K key, WeightedSpanTerm value) {
- WeightedSpanTerm prev = super.put(key, value);
- if (prev == null) return prev;
- WeightedSpanTerm prevTerm = prev;
- WeightedSpanTerm newTerm = value;
- if (!prevTerm.positionSensitive) {
- newTerm.positionSensitive = false;
- }
- return prev;
- }
-
- }
-
- public boolean getExpandMultiTermQuery() {
- return expandMultiTermQuery;
- }
-
- public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
- this.expandMultiTermQuery = expandMultiTermQuery;
- }
-
- public boolean isCachedTokenStream() {
- return cachedTokenStream;
- }
-
- public TokenStream getTokenStream() {
- return tokenStream;
- }
-
- /**
- * By default, {@link TokenStream}s that are not of the type
- * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
- * ensure an efficient reset - if you are already using a different caching
- * {@link TokenStream} impl and you don't want it to be wrapped, set this to
- * false.
- *
- * @param wrap
- */
- public void setWrapIfNotCachingTokenFilter(boolean wrap) {
- this.wrapToCaching = wrap;
- }
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (working copy)
@@ -1,285 +0,0 @@
-/*
- * Created on 28-Oct-2004
- */
-package org.apache.lucene.search.highlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Comparator;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.TermFreqVector;
-import org.apache.lucene.index.TermPositionVector;
-import org.apache.lucene.index.TermVectorOffsetInfo;
-
-/**
- * Hides implementation issues associated with obtaining a TokenStream for use
- * with the higlighter - can obtain from TermFreqVectors with offsets and
- * (optionally) positions or from Analyzer class reparsing the stored content.
- */
-public class TokenSources {
- /**
- * A convenience method that tries to first get a TermPositionVector for the
- * specified docId, then, falls back to using the passed in
- * {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
- * This is useful when you already have the document, but would prefer to use
- * the vector first.
- *
- * @param reader The {@link org.apache.lucene.index.IndexReader} to use to try
- * and get the vector from
- * @param docId The docId to retrieve.
- * @param field The field to retrieve on the document
- * @param doc The document to fall back on
- * @param analyzer The analyzer to use for creating the TokenStream if the
- * vector doesn't exist
- * @return The {@link org.apache.lucene.analysis.TokenStream} for the
- * {@link org.apache.lucene.document.Fieldable} on the
- * {@link org.apache.lucene.document.Document}
- * @throws IOException if there was an error loading
- */
- public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
- String field, Document doc, Analyzer analyzer) throws IOException {
- TokenStream ts = null;
-
- TermFreqVector tfv = reader.getTermFreqVector(docId, field);
- if (tfv != null) {
- if (tfv instanceof TermPositionVector) {
- ts = getTokenStream((TermPositionVector) tfv);
- }
- }
- // No token info stored so fall back to analyzing raw content
- if (ts == null) {
- ts = getTokenStream(doc, field, analyzer);
- }
- return ts;
- }
-
- /**
- * A convenience method that tries a number of approaches to getting a token
- * stream. The cost of finding there are no termVectors in the index is
- * minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?)
- * approach to coding is probably acceptable
- *
- * @param reader
- * @param docId
- * @param field
- * @param analyzer
- * @return null if field not stored correctly
- * @throws IOException
- */
- public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
- String field, Analyzer analyzer) throws IOException {
- TokenStream ts = null;
-
- TermFreqVector tfv = reader.getTermFreqVector(docId, field);
- if (tfv != null) {
- if (tfv instanceof TermPositionVector) {
- ts = getTokenStream((TermPositionVector) tfv);
- }
- }
- // No token info stored so fall back to analyzing raw content
- if (ts == null) {
- ts = getTokenStream(reader, docId, field, analyzer);
- }
- return ts;
- }
-
- public static TokenStream getTokenStream(TermPositionVector tpv) {
- // assumes the worst and makes no assumptions about token position
- // sequences.
- return getTokenStream(tpv, false);
- }
-
- /**
- * Low level api. Returns a token stream or null if no offset info available
- * in index. This can be used to feed the highlighter with a pre-parsed token
- * stream
- *
- * In my tests the speeds to recreate 1000 token streams using this method
- * are: - with TermVector offset only data stored - 420 milliseconds - with
- * TermVector offset AND position data stored - 271 milliseconds (nb timings
- * for TermVector with position data are based on a tokenizer with contiguous
- * positions - no overlaps or gaps) The cost of not using TermPositionVector
- * to store pre-parsed content and using an analyzer to re-parse the original
- * content: - reanalyzing the original content - 980 milliseconds
- *
- * The re-analyze timings will typically vary depending on - 1) The complexity
- * of the analyzer code (timings above were using a
- * stemmer/lowercaser/stopword combo) 2) The number of other fields (Lucene
- * reads ALL fields off the disk when accessing just one document field - can
- * cost dear!) 3) Use of compression on field storage - could be faster due to
- * compression (less disk IO) or slower (more CPU burn) depending on the
- * content.
- *
- * @param tpv
- * @param tokenPositionsGuaranteedContiguous true if the token position
- * numbers have no overlaps or gaps. If looking to eek out the last
- * drops of performance, set to true. If in doubt, set to false.
- */
- public static TokenStream getTokenStream(TermPositionVector tpv,
- boolean tokenPositionsGuaranteedContiguous) {
- if (!tokenPositionsGuaranteedContiguous && tpv.getTermPositions(0) != null) {
- return new TokenStreamFromTermPositionVector(tpv);
- }
-
- // an object used to iterate across an array of tokens
- final class StoredTokenStream extends TokenStream {
- Token tokens[];
-
- int currentToken = 0;
-
- CharTermAttribute termAtt;
-
- OffsetAttribute offsetAtt;
-
- StoredTokenStream(Token tokens[]) {
- this.tokens = tokens;
- termAtt = addAttribute(CharTermAttribute.class);
- offsetAtt = addAttribute(OffsetAttribute.class);
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (currentToken >= tokens.length) {
- return false;
- }
- Token token = tokens[currentToken++];
- clearAttributes();
- termAtt.setEmpty().append(token);
- offsetAtt.setOffset(token.startOffset(), token.endOffset());
- return true;
- }
- }
- // code to reconstruct the original sequence of Tokens
- String[] terms = tpv.getTerms();
- int[] freq = tpv.getTermFrequencies();
- int totalTokens = 0;
-
- for (int t = 0; t < freq.length; t++) {
- totalTokens += freq[t];
- }
- Token tokensInOriginalOrder[] = new Token[totalTokens];
- ArrayList unsortedTokens = null;
- for (int t = 0; t < freq.length; t++) {
- TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
- if (offsets == null) {
- throw new IllegalArgumentException("Required TermVector Offset information was not found");
- }
-
- int[] pos = null;
- if (tokenPositionsGuaranteedContiguous) {
- // try get the token position info to speed up assembly of tokens into
- // sorted sequence
- pos = tpv.getTermPositions(t);
- }
- if (pos == null) {
- // tokens NOT stored with positions or not guaranteed contiguous - must
- // add to list and sort later
- if (unsortedTokens == null) {
- unsortedTokens = new ArrayList();
- }
- for (int tp = 0; tp < offsets.length; tp++) {
- Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp]
- .getEndOffset());
- unsortedTokens.add(token);
- }
- } else {
- // We have positions stored and a guarantee that the token position
- // information is contiguous
-
- // This may be fast BUT wont work if Tokenizers used which create >1
- // token in same position or
- // creates jumps in position numbers - this code would fail under those
- // circumstances
-
- // tokens stored with positions - can use this to index straight into
- // sorted array
- for (int tp = 0; tp < pos.length; tp++) {
- Token token = new Token(terms[t], offsets[tp].getStartOffset(),
- offsets[tp].getEndOffset());
- tokensInOriginalOrder[pos[tp]] = token;
- }
- }
- }
- // If the field has been stored without position data we must perform a sort
- if (unsortedTokens != null) {
- tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens
- .size()]);
- Arrays.sort(tokensInOriginalOrder, new Comparator() {
- public int compare(Token t1, Token t2) {
- if (t1.startOffset() > t2.endOffset())
- return 1;
- if (t1.startOffset() < t2.startOffset())
- return -1;
- return 0;
- }
- });
- }
- return new StoredTokenStream(tokensInOriginalOrder);
- }
-
- public static TokenStream getTokenStream(IndexReader reader, int docId,
- String field) throws IOException {
- TermFreqVector tfv = reader.getTermFreqVector(docId, field);
- if (tfv == null) {
- throw new IllegalArgumentException(field + " in doc #" + docId
- + "does not have any term position data stored");
- }
- if (tfv instanceof TermPositionVector) {
- TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(
- docId, field);
- return getTokenStream(tpv);
- }
- throw new IllegalArgumentException(field + " in doc #" + docId
- + "does not have any term position data stored");
- }
-
- // convenience method
- public static TokenStream getTokenStream(IndexReader reader, int docId,
- String field, Analyzer analyzer) throws IOException {
- Document doc = reader.document(docId);
- return getTokenStream(doc, field, analyzer);
- }
-
- public static TokenStream getTokenStream(Document doc, String field,
- Analyzer analyzer) {
- String contents = doc.get(field);
- if (contents == null) {
- throw new IllegalArgumentException("Field " + field
- + " in document is not stored and cannot be analyzed");
- }
- return getTokenStream(field, contents, analyzer);
- }
-
- // convenience method
- public static TokenStream getTokenStream(String field, String contents,
- Analyzer analyzer) {
- return analyzer.tokenStream(field, new StringReader(contents));
- }
-
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java (working copy)
@@ -1,46 +0,0 @@
-package org.apache.lucene.search.highlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.analysis.TokenStream;
-
-/**
- * Implements the policy for breaking text into multiple fragments for
- * consideration by the {@link Highlighter} class. A sophisticated
- * implementation may do this on the basis of detecting end of sentences in the
- * text.
- */
-public interface Fragmenter {
-
- /**
- * Initializes the Fragmenter. You can grab references to the Attributes you are
- * interested in from tokenStream and then access the values in {@link #isNewFragment()}.
- *
- * @param originalText the original source text
- * @param tokenStream the {@link TokenStream} to be fragmented
- */
- public void start(String originalText, TokenStream tokenStream);
-
-
- /**
- * Test to see if this token from the stream should be held in a new
- * TextFragment. Every time this is called, the TokenStream
- * passed to start(String, TokenStream) will have been incremented.
- *
- */
- public boolean isNewFragment();
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/InvalidTokenOffsetsException.java (working copy)
@@ -1,31 +0,0 @@
-package org.apache.lucene.search.highlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Exception thrown if TokenStream Tokens are incompatible with provided text
- *
- */
-public class InvalidTokenOffsetsException extends Exception
-{
-
- public InvalidTokenOffsetsException(String message)
- {
- super(message);
- }
-
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/GradientFormatter.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/GradientFormatter.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/GradientFormatter.java (working copy)
@@ -1,227 +0,0 @@
-package org.apache.lucene.search.highlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Formats text with different color intensity depending on the score of the
- * term.
- *
- */
-public class GradientFormatter implements Formatter
-{
- private float maxScore;
-
- int fgRMin, fgGMin, fgBMin;
-
- int fgRMax, fgGMax, fgBMax;
-
- protected boolean highlightForeground;
-
- int bgRMin, bgGMin, bgBMin;
-
- int bgRMax, bgGMax, bgBMax;
-
- protected boolean highlightBackground;
-
- /**
- * Sets the color range for the IDF scores
- *
- * @param maxScore
- * The score (and above) displayed as maxColor (See QueryScorer.getMaxWeight
- * which can be used to calibrate scoring scale)
- * @param minForegroundColor
- * The hex color used for representing IDF scores of zero eg
- * #FFFFFF (white) or null if no foreground color required
- * @param maxForegroundColor
- * The largest hex color used for representing IDF scores eg
- * #000000 (black) or null if no foreground color required
- * @param minBackgroundColor
- * The hex color used for representing IDF scores of zero eg
- * #FFFFFF (white) or null if no background color required
- * @param maxBackgroundColor
- * The largest hex color used for representing IDF scores eg
- * #000000 (black) or null if no background color required
- */
- public GradientFormatter(float maxScore, String minForegroundColor,
- String maxForegroundColor, String minBackgroundColor,
- String maxBackgroundColor)
- {
- highlightForeground = (minForegroundColor != null)
- && (maxForegroundColor != null);
- if (highlightForeground)
- {
- if (minForegroundColor.length() != 7)
- {
- throw new IllegalArgumentException(
- "minForegroundColor is not 7 bytes long eg a hex "
- + "RGB value such as #FFFFFF");
- }
- if (maxForegroundColor.length() != 7)
- {
- throw new IllegalArgumentException(
- "minForegroundColor is not 7 bytes long eg a hex "
- + "RGB value such as #FFFFFF");
- }
- fgRMin = hexToInt(minForegroundColor.substring(1, 3));
- fgGMin = hexToInt(minForegroundColor.substring(3, 5));
- fgBMin = hexToInt(minForegroundColor.substring(5, 7));
-
- fgRMax = hexToInt(maxForegroundColor.substring(1, 3));
- fgGMax = hexToInt(maxForegroundColor.substring(3, 5));
- fgBMax = hexToInt(maxForegroundColor.substring(5, 7));
- }
-
- highlightBackground = (minBackgroundColor != null)
- && (maxBackgroundColor != null);
- if (highlightBackground)
- {
- if (minBackgroundColor.length() != 7)
- {
- throw new IllegalArgumentException(
- "minBackgroundColor is not 7 bytes long eg a hex "
- + "RGB value such as #FFFFFF");
- }
- if (maxBackgroundColor.length() != 7)
- {
- throw new IllegalArgumentException(
- "minBackgroundColor is not 7 bytes long eg a hex "
- + "RGB value such as #FFFFFF");
- }
- bgRMin = hexToInt(minBackgroundColor.substring(1, 3));
- bgGMin = hexToInt(minBackgroundColor.substring(3, 5));
- bgBMin = hexToInt(minBackgroundColor.substring(5, 7));
-
- bgRMax = hexToInt(maxBackgroundColor.substring(1, 3));
- bgGMax = hexToInt(maxBackgroundColor.substring(3, 5));
- bgBMax = hexToInt(maxBackgroundColor.substring(5, 7));
- }
- // this.corpusReader = corpusReader;
- this.maxScore = maxScore;
- // totalNumDocs = corpusReader.numDocs();
- }
-
- public String highlightTerm(String originalText, TokenGroup tokenGroup)
- {
- if (tokenGroup.getTotalScore() == 0)
- return originalText;
- float score = tokenGroup.getTotalScore();
- if (score == 0)
- {
- return originalText;
- }
- StringBuilder sb = new StringBuilder();
- sb.append("");
- sb.append(originalText);
- sb.append("");
- return sb.toString();
- }
-
- protected String getForegroundColorString(float score)
- {
- int rVal = getColorVal(fgRMin, fgRMax, score);
- int gVal = getColorVal(fgGMin, fgGMax, score);
- int bVal = getColorVal(fgBMin, fgBMax, score);
- StringBuilder sb = new StringBuilder();
- sb.append("#");
- sb.append(intToHex(rVal));
- sb.append(intToHex(gVal));
- sb.append(intToHex(bVal));
- return sb.toString();
- }
-
- protected String getBackgroundColorString(float score)
- {
- int rVal = getColorVal(bgRMin, bgRMax, score);
- int gVal = getColorVal(bgGMin, bgGMax, score);
- int bVal = getColorVal(bgBMin, bgBMax, score);
- StringBuilder sb = new StringBuilder();
- sb.append("#");
- sb.append(intToHex(rVal));
- sb.append(intToHex(gVal));
- sb.append(intToHex(bVal));
- return sb.toString();
- }
-
- private int getColorVal(int colorMin, int colorMax, float score)
- {
- if (colorMin == colorMax)
- {
- return colorMin;
- }
- float scale = Math.abs(colorMin - colorMax);
- float relScorePercent = Math.min(maxScore, score) / maxScore;
- float colScore = scale * relScorePercent;
- return Math.min(colorMin, colorMax) + (int) colScore;
- }
-
- private static char hexDigits[] = { '0', '1', '2', '3', '4', '5', '6', '7',
- '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
-
- private static String intToHex(int i)
- {
- return "" + hexDigits[(i & 0xF0) >> 4] + hexDigits[i & 0x0F];
- }
-
- /**
- * Converts a hex string into an int. Integer.parseInt(hex, 16) assumes the
- * input is nonnegative unless there is a preceding minus sign. This method
- * reads the input as twos complement instead, so if the input is 8 bytes
- * long, it will correctly restore a negative int produced by
- * Integer.toHexString() but not necessarily one produced by
- * Integer.toString(x,16) since that method will produce a string like '-FF'
- * for negative integer values.
- *
- * @param hex
- * A string in capital or lower case hex, of no more then 16
- * characters.
- * @throws NumberFormatException
- * if the string is more than 16 characters long, or if any
- * character is not in the set [0-9a-fA-f]
- */
- public static final int hexToInt(String hex)
- {
- int len = hex.length();
- if (len > 16)
- throw new NumberFormatException();
-
- int l = 0;
- for (int i = 0; i < len; i++)
- {
- l <<= 4;
- int c = Character.digit(hex.charAt(i), 16);
- if (c < 0)
- throw new NumberFormatException();
- l |= c;
- }
- return l;
- }
-
-}
-
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java (working copy)
@@ -1,116 +0,0 @@
-package org.apache.lucene.search.highlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.index.TermPositionVector;
-import org.apache.lucene.index.TermVectorOffsetInfo;
-
-public final class TokenStreamFromTermPositionVector extends TokenStream {
-
- private final List positionedTokens = new ArrayList();
-
- private Iterator tokensAtCurrentPosition;
-
- private CharTermAttribute termAttribute;
-
- private PositionIncrementAttribute positionIncrementAttribute;
-
- private OffsetAttribute offsetAttribute;
-
- /**
- * Constructor.
- *
- * @param termPositionVector TermPositionVector that contains the data for
- * creating the TokenStream. Must have positions and offsets.
- */
- public TokenStreamFromTermPositionVector(
- final TermPositionVector termPositionVector) {
- termAttribute = addAttribute(CharTermAttribute.class);
- positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
- offsetAttribute = addAttribute(OffsetAttribute.class);
- final String[] terms = termPositionVector.getTerms();
- for (int i = 0; i < terms.length; i++) {
- final TermVectorOffsetInfo[] offsets = termPositionVector.getOffsets(i);
- final int[] termPositions = termPositionVector.getTermPositions(i);
- for (int j = 0; j < termPositions.length; j++) {
- Token token;
- if (offsets != null) {
- token = new Token(terms[i].toCharArray(), 0, terms[i].length(),
- offsets[j].getStartOffset(), offsets[j].getEndOffset());
- } else {
- token = new Token();
- token.setEmpty().append(terms[i]);
- }
- // Yes - this is the position, not the increment! This is for
- // sorting. This value
- // will be corrected before use.
- token.setPositionIncrement(termPositions[j]);
- this.positionedTokens.add(token);
- }
- }
- final Comparator tokenComparator = new Comparator() {
- public int compare(final Token o1, final Token o2) {
- if (o1.getPositionIncrement() < o2.getPositionIncrement()) {
- return -1;
- }
- if (o1.getPositionIncrement() > o2.getPositionIncrement()) {
- return 1;
- }
- return 0;
- }
- };
- Collections.sort(this.positionedTokens, tokenComparator);
- int lastPosition = -1;
- for (final Token token : this.positionedTokens) {
- int thisPosition = token.getPositionIncrement();
- token.setPositionIncrement(thisPosition - lastPosition);
- lastPosition = thisPosition;
- }
- this.tokensAtCurrentPosition = this.positionedTokens.iterator();
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (this.tokensAtCurrentPosition.hasNext()) {
- final Token next = this.tokensAtCurrentPosition.next();
- clearAttributes();
- termAttribute.setEmpty().append(next);
- positionIncrementAttribute.setPositionIncrement(next
- .getPositionIncrement());
- offsetAttribute.setOffset(next.startOffset(), next.endOffset());
- return true;
- }
- return false;
- }
-
- @Override
- public void reset() throws IOException {
- this.tokensAtCurrentPosition = this.positionedTokens.iterator();
- }
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Encoder.java (working copy)
@@ -1,29 +0,0 @@
-package org.apache.lucene.search.highlight;
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-/**
- * Encodes original text. The Encoder works with the {@link Formatter} to generate output.
- *
- */
-public interface Encoder
-{
- /**
- * @param originalText The section of text being output
- */
- String encodeText(String originalText);
-}
\ No newline at end of file
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java (working copy)
@@ -1,104 +0,0 @@
-package org.apache.lucene.search.highlight;
-
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-
-/**
- * Lightweight class to hold term, weight, and positions used for scoring this
- * term.
- */
-public class WeightedSpanTerm extends WeightedTerm{
- boolean positionSensitive;
- private List<PositionSpan> positionSpans = new ArrayList<PositionSpan>();
-
- /**
- * @param weight
- * @param term
- */
- public WeightedSpanTerm(float weight, String term) {
- super(weight, term);
- this.positionSpans = new ArrayList<PositionSpan>();
- }
-
- /**
- * @param weight
- * @param term
- * @param positionSensitive
- */
- public WeightedSpanTerm(float weight, String term, boolean positionSensitive) {
- super(weight, term);
- this.positionSensitive = positionSensitive;
- }
-
- /**
- * Checks to see if this term is valid at position.
- *
- * @param position
- * to check against valid term positions
- * @return true iff this term is a hit at this position
- */
- public boolean checkPosition(int position) {
- // There would probably be a slight speed improvement if PositionSpans
- // where kept in some sort of priority queue - that way this method
- // could
- // bail early without checking each PositionSpan.
- Iterator<PositionSpan> positionSpanIt = positionSpans.iterator();
-
- while (positionSpanIt.hasNext()) {
- PositionSpan posSpan = positionSpanIt.next();
-
- if (((position >= posSpan.start) && (position <= posSpan.end))) {
- return true;
- }
- }
-
- return false;
- }
-
- public void addPositionSpans(List<PositionSpan> positionSpans) {
- this.positionSpans.addAll(positionSpans);
- }
-
- public boolean isPositionSensitive() {
- return positionSensitive;
- }
-
- public void setPositionSensitive(boolean positionSensitive) {
- this.positionSensitive = positionSensitive;
- }
-
- public List<PositionSpan> getPositionSpans() {
- return positionSpans;
- }
-}
-
-
-// Utility class to store a Span
-class PositionSpan {
- int start;
- int end;
-
- public PositionSpan(int start, int end) {
- this.start = start;
- this.end = end;
- }
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java (working copy)
@@ -1,172 +0,0 @@
-package org.apache.lucene.search.highlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Iterator;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.FilteredQuery;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.util.StringHelper;
-
-/**
- * Utility class used to extract the terms used in a query, plus any weights.
- * This class will not find terms for MultiTermQuery, TermRangeQuery and PrefixQuery classes
- * so the caller must pass a rewritten query (see Query.rewrite) to obtain a list of
- * expanded terms.
- *
- */
-public final class QueryTermExtractor
-{
-
- /**
- * Extracts all terms texts of a given Query into an array of WeightedTerms
- *
- * @param query Query to extract term texts from
- * @return an array of the terms used in a query, plus their weights.
- */
- public static final WeightedTerm[] getTerms(Query query)
- {
- return getTerms(query,false);
- }
-
- /**
- * Extracts all terms texts of a given Query into an array of WeightedTerms
- *
- * @param query Query to extract term texts from
- * @param reader used to compute IDF which can be used to a) score selected fragments better
- * b) use graded highlights eg changing intensity of font color
- * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
- * @return an array of the terms used in a query, plus their weights.
- */
- public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName)
- {
- WeightedTerm[] terms=getTerms(query,false, fieldName);
- int totalNumDocs=reader.numDocs();
- for (int i = 0; i < terms.length; i++)
- {
- try
- {
- int docFreq=reader.docFreq(new Term(fieldName,terms[i].term));
- // docFreq counts deletes
- if(totalNumDocs < docFreq) {
- docFreq = totalNumDocs;
- }
- //IDF algorithm taken from DefaultSimilarity class
- float idf=(float)(Math.log((float)totalNumDocs/(double)(docFreq+1)) + 1.0);
- terms[i].weight*=idf;
- }
- catch (IOException e)
- {
- //ignore
- }
- }
- return terms;
- }
-
- /**
- * Extracts all terms texts of a given Query into an array of WeightedTerms
- *
- * @param query Query to extract term texts from
- * @param prohibited true to extract "prohibited" terms, too
- * @param fieldName The fieldName used to filter query terms
- * @return an array of the terms used in a query, plus their weights.
- */
- public static final WeightedTerm[] getTerms(Query query, boolean prohibited, String fieldName)
- {
- HashSet<WeightedTerm> terms=new HashSet<WeightedTerm>();
- if(fieldName!=null)
- {
- fieldName= StringHelper.intern(fieldName);
- }
- getTerms(query,terms,prohibited,fieldName);
- return terms.toArray(new WeightedTerm[0]);
- }
-
- /**
- * Extracts all terms texts of a given Query into an array of WeightedTerms
- *
- * @param query Query to extract term texts from
- * @param prohibited true to extract "prohibited" terms, too
- * @return an array of the terms used in a query, plus their weights.
- */
- public static final WeightedTerm[] getTerms(Query query, boolean prohibited)
- {
- return getTerms(query,prohibited,null);
- }
-
- //fieldname MUST be interned prior to this call
- private static final void getTerms(Query query, HashSet<WeightedTerm> terms,boolean prohibited, String fieldName)
- {
- try
- {
- if (query instanceof BooleanQuery)
- getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited, fieldName);
- else
- if(query instanceof FilteredQuery)
- getTermsFromFilteredQuery((FilteredQuery)query, terms,prohibited, fieldName);
- else
- {
- HashSet<Term> nonWeightedTerms=new HashSet<Term>();
- query.extractTerms(nonWeightedTerms);
- for (Iterator<Term> iter = nonWeightedTerms.iterator(); iter.hasNext();)
- {
- Term term = iter.next();
- if((fieldName==null)||(term.field()==fieldName))
- {
- terms.add(new WeightedTerm(query.getBoost(),term.text()));
- }
- }
- }
- }
- catch(UnsupportedOperationException ignore)
- {
- //this is non-fatal for our purposes
- }
- }
-
- /**
- * extractTerms is currently the only query-independent means of introspecting queries but it only reveals
- * a list of terms for that query - not the boosts each individual term in that query may or may not have.
- * "Container" queries such as BooleanQuery should be unwrapped to get at the boost info held
- * in each child element.
- * Some discussion around this topic here:
- * http://www.gossamer-threads.com/lists/lucene/java-dev/34208?search_string=introspection;#34208
- * Unfortunately there seemed to be limited interest in requiring all Query objects to implement
- * something common which would allow access to child queries so what follows here are query-specific
- * implementations for accessing embedded query elements.
- */
- private static final void getTermsFromBooleanQuery(BooleanQuery query, HashSet<WeightedTerm> terms, boolean prohibited, String fieldName)
- {
- BooleanClause[] queryClauses = query.getClauses();
- for (int i = 0; i < queryClauses.length; i++)
- {
- if (prohibited || queryClauses[i].getOccur()!=BooleanClause.Occur.MUST_NOT)
- getTerms(queryClauses[i].getQuery(), terms, prohibited, fieldName);
- }
- }
- private static void getTermsFromFilteredQuery(FilteredQuery query, HashSet<WeightedTerm> terms, boolean prohibited, String fieldName)
- {
- getTerms(query.getQuery(),terms,prohibited,fieldName);
- }
-
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java (working copy)
@@ -1,91 +0,0 @@
-package org.apache.lucene.search.highlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-/**
- * Low-level class used to record information about a section of a document
- * with a score.
- *
- *
- */
-public class TextFragment
-{
- CharSequence markedUpText;
- int fragNum;
- int textStartPos;
- int textEndPos;
- float score;
-
- public TextFragment(CharSequence markedUpText,int textStartPos, int fragNum)
- {
- this.markedUpText=markedUpText;
- this.textStartPos = textStartPos;
- this.fragNum = fragNum;
- }
- /**
- * @deprecated Use {@link #TextFragment(CharSequence, int, int)} instead.
- * This constructor will be removed in Lucene 4.0
- */
- @Deprecated
- public TextFragment(StringBuffer markedUpText,int textStartPos, int fragNum)
- {
- this.markedUpText=markedUpText;
- this.textStartPos = textStartPos;
- this.fragNum = fragNum;
- }
- void setScore(float score)
- {
- this.score=score;
- }
- public float getScore()
- {
- return score;
- }
- /**
- * @param frag2 Fragment to be merged into this one
- */
- public void merge(TextFragment frag2)
- {
- textEndPos = frag2.textEndPos;
- score=Math.max(score,frag2.score);
- }
- /**
- * @param fragment
- * @return true if this fragment follows the one passed
- */
- public boolean follows(TextFragment fragment)
- {
- return textStartPos == fragment.textEndPos;
- }
-
- /**
- * @return the fragment sequence number
- */
- public int getFragNum()
- {
- return fragNum;
- }
-
- /* Returns the marked-up text for this text fragment
- */
- @Override
- public String toString() {
- return markedUpText.subSequence(textStartPos, textEndPos).toString();
- }
-
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/package.html (working copy)
@@ -1,99 +0,0 @@
-
-
-
-
-
-The highlight package contains classes to provide "keyword in context" features
-typically used to highlight search terms in the text of results pages.
-The Highlighter class is the central component and can be used to extract the
-most interesting sections of a piece of text and highlight them, with the help of
-Fragmenter, fragment Scorer, and Formatter classes.
-
-Example Usage
-
-
- //... Above, create documents with two fields, one with term vectors (tv) and one without (notv)
- IndexSearcher searcher = new IndexSearcher(directory);
- QueryParser parser = new QueryParser("notv", analyzer);
- Query query = parser.parse("million");
-
- TopDocs hits = searcher.search(query, 10);
-
- SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
- Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
- for (int i = 0; i < 10; i++) {
- int id = hits.scoreDocs[i].doc;
- Document doc = searcher.doc(id);
- String text = doc.get("notv");
- TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "notv", analyzer);
- TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "...");
- for (int j = 0; j < frag.length; j++) {
- if ((frag[j] != null) && (frag[j].getScore() > 0)) {
- System.out.println((frag[j].toString()));
- }
- }
- //Term vector
- text = doc.get("tv");
- tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), hits.scoreDocs[i].doc, "tv", analyzer);
- frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
- for (int j = 0; j < frag.length; j++) {
- if ((frag[j] != null) && (frag[j].getScore() > 0)) {
- System.out.println((frag[j].toString()));
- }
- }
- System.out.println("-------------");
- }
-
-
-New features 06/02/2005
-
-This release adds options for encoding (thanks to Nicko Cadell).
-An "Encoder" implementation such as the new SimpleHTMLEncoder class can be passed to the highlighter to encode
-all those non-xhtml standard characters such as & into legal values. This simple class may not suffice for
-some languages - Commons Lang has an implementation that could be used: escapeHtml(String) in
-http://svn.apache.org/viewcvs.cgi/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java?rev=137958&view=markup
-
-New features 22/12/2004
-
-This release adds some new capabilities:
-
- - Faster highlighting using Term vector support
- - New formatting options to use color intensity to show informational value
- - Options for better summarization by using term IDF scores to influence fragment selection
-
-
-
-The highlighter takes a TokenStream as input. Until now these streams have typically been produced
-using an Analyzer but the new class TokenSources provides helper methods for obtaining TokenStreams from
-the new TermVector position support (see latest CVS version).
-
-The new class GradientFormatter can use a scale of colors to highlight terms according to their score.
-A subtle use of color can help emphasise the reasons for matching (useful when doing "MoreLikeThis" queries and
-you want to see what the basis of the similarities are).
-
-The QueryScorer class has a new constructor which can use an IndexReader to derive the IDF (inverse document frequency)
-for each term in order to influence the score. This is useful for helping to extracting the most significant sections
-of a document and in supplying scores used by the new GradientFormatter to color significant words more strongly.
-The QueryScorer.getMaxWeight method is useful when passed to the GradientFormatter constructor to define the top score
-which is associated with the top color.
-
-
-
-
-
-
\ No newline at end of file
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (working copy)
@@ -1,268 +0,0 @@
-package org.apache.lucene.search.highlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.lucene.analysis.CachingTokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.memory.MemoryIndex;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.spans.SpanQuery;
-import org.apache.lucene.util.StringHelper;
-
-/**
- * {@link Scorer} implementation which scores text fragments by the number of
- * unique query terms found. This class converts appropriate {@link Query}s to
- * {@link SpanQuery}s and attempts to score only those terms that participated in
- * generating the 'hit' on the document.
- */
-public class QueryScorer implements Scorer {
- private float totalScore;
- private Set<String> foundTerms;
- private Map<String,WeightedSpanTerm> fieldWeightedSpanTerms;
- private float maxTermWeight;
- private int position = -1;
- private String defaultField;
- private CharTermAttribute termAtt;
- private PositionIncrementAttribute posIncAtt;
- private boolean expandMultiTermQuery = true;
- private Query query;
- private String field;
- private IndexReader reader;
- private boolean skipInitExtractor;
- private boolean wrapToCaching = true;
-
- /**
- * @param query Query to use for highlighting
- */
- public QueryScorer(Query query) {
- init(query, null, null, true);
- }
-
- /**
- * @param query Query to use for highlighting
- * @param field Field to highlight - pass null to ignore fields
- */
- public QueryScorer(Query query, String field) {
- init(query, field, null, true);
- }
-
- /**
- * @param query Query to use for highlighting
- * @param field Field to highlight - pass null to ignore fields
- * @param reader {@link IndexReader} to use for quasi tf/idf scoring
- */
- public QueryScorer(Query query, IndexReader reader, String field) {
- init(query, field, reader, true);
- }
-
-
- /**
- * @param query to use for highlighting
- * @param reader {@link IndexReader} to use for quasi tf/idf scoring
- * @param field to highlight - pass null to ignore fields
- * @param defaultField
- */
- public QueryScorer(Query query, IndexReader reader, String field, String defaultField) {
- this.defaultField = StringHelper.intern(defaultField);
- init(query, field, reader, true);
- }
-
- /**
- * @param defaultField - The default field for queries with the field name unspecified
- */
- public QueryScorer(Query query, String field, String defaultField) {
- this.defaultField = StringHelper.intern(defaultField);
- init(query, field, null, true);
- }
-
- /**
- * @param weightedTerms an array of pre-created {@link WeightedSpanTerm}s
- */
- public QueryScorer(WeightedSpanTerm[] weightedTerms) {
- this.fieldWeightedSpanTerms = new HashMap<String,WeightedSpanTerm>(weightedTerms.length);
-
- for (int i = 0; i < weightedTerms.length; i++) {
- WeightedSpanTerm existingTerm = fieldWeightedSpanTerms.get(weightedTerms[i].term);
-
- if ((existingTerm == null) ||
- (existingTerm.weight < weightedTerms[i].weight)) {
- // if a term is defined more than once, always use the highest
- // scoring weight
- fieldWeightedSpanTerms.put(weightedTerms[i].term, weightedTerms[i]);
- maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight());
- }
- }
- skipInitExtractor = true;
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
- */
- public float getFragmentScore() {
- return totalScore;
- }
-
- /**
- *
- * @return The highest weighted term (useful for passing to
- * GradientFormatter to set top end of coloring scale).
- */
- public float getMaxTermWeight() {
- return maxTermWeight;
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token,
- * int)
- */
- public float getTokenScore() {
- position += posIncAtt.getPositionIncrement();
- String termText = termAtt.toString();
-
- WeightedSpanTerm weightedSpanTerm;
-
- if ((weightedSpanTerm = fieldWeightedSpanTerms.get(
- termText)) == null) {
- return 0;
- }
-
- if (weightedSpanTerm.positionSensitive &&
- !weightedSpanTerm.checkPosition(position)) {
- return 0;
- }
-
- float score = weightedSpanTerm.getWeight();
-
- // found a query term - is it unique in this doc?
- if (!foundTerms.contains(termText)) {
- totalScore += score;
- foundTerms.add(termText);
- }
-
- return score;
- }
-
- /* (non-Javadoc)
- * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream)
- */
- public TokenStream init(TokenStream tokenStream) throws IOException {
- position = -1;
- termAtt = tokenStream.addAttribute(CharTermAttribute.class);
- posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
- if(!skipInitExtractor) {
- if(fieldWeightedSpanTerms != null) {
- fieldWeightedSpanTerms.clear();
- }
- return initExtractor(tokenStream);
- }
- return null;
- }
-
- /**
- * Retrieve the {@link WeightedSpanTerm} for the specified token. Useful for passing
- * Span information to a {@link Fragmenter}.
- *
- * @param token to get {@link WeightedSpanTerm} for
- * @return WeightedSpanTerm for token
- */
- public WeightedSpanTerm getWeightedSpanTerm(String token) {
- return fieldWeightedSpanTerms.get(token);
- }
-
- /**
- */
- private void init(Query query, String field, IndexReader reader, boolean expandMultiTermQuery) {
- this.reader = reader;
- this.expandMultiTermQuery = expandMultiTermQuery;
- this.query = query;
- this.field = field;
- }
-
- private TokenStream initExtractor(TokenStream tokenStream) throws IOException {
- WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
- : new WeightedSpanTermExtractor(defaultField);
-
- qse.setExpandMultiTermQuery(expandMultiTermQuery);
- qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
- if (reader == null) {
- this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query,
- tokenStream, field);
- } else {
- this.fieldWeightedSpanTerms = qse.getWeightedSpanTermsWithScores(query,
- tokenStream, field, reader);
- }
- if(qse.isCachedTokenStream()) {
- return qse.getTokenStream();
- }
-
- return null;
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
- */
- public void startFragment(TextFragment newFragment) {
- foundTerms = new HashSet<String>();
- totalScore = 0;
- }
-
- /**
- * @return true if multi-term queries should be expanded
- */
- public boolean isExpandMultiTermQuery() {
- return expandMultiTermQuery;
- }
-
- /**
- * Controls whether or not multi-term queries are expanded
- * against a {@link MemoryIndex} {@link IndexReader}.
- *
- * @param expandMultiTermQuery true if multi-term queries should be expanded
- */
- public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
- this.expandMultiTermQuery = expandMultiTermQuery;
- }
-
- /**
- * By default, {@link TokenStream}s that are not of the type
- * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
- * ensure an efficient reset - if you are already using a different caching
- * {@link TokenStream} impl and you don't want it to be wrapped, set this to
- * false.
- *
- * @param wrap
- */
- public void setWrapIfNotCachingTokenFilter(boolean wrap) {
- this.wrapToCaching = wrap;
- }
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java (working copy)
@@ -1,81 +0,0 @@
-package org.apache.lucene.search.highlight;
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Simple {@link Encoder} implementation to escape text for HTML output
- *
- */
-public class SimpleHTMLEncoder implements Encoder
-{
- public SimpleHTMLEncoder()
- {
- }
-
- public String encodeText(String originalText)
- {
- return htmlEncode(originalText);
- }
-
- /**
- * Encode string into HTML
- */
- public final static String htmlEncode(String plainText)
- {
- if (plainText == null || plainText.length() == 0)
- {
- return "";
- }
-
- StringBuilder result = new StringBuilder(plainText.length());
-
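- for (int index=0; index<plainText.length(); index++)
- {
- char ch = plainText.charAt(index);
-
- switch (ch)
- {
- case '"':
- result.append("&quot;");
- break;
-
- case '&':
- result.append("&amp;");
- break;
-
- case '<':
- result.append("&lt;");
- break;
-
- case '>':
- result.append("&gt;");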
- break;
-
- default:
- if (ch < 128)
- {
- result.append(ch);
- }
- else
- {
- result.append("&#").append((int)ch).append(";");
- }
- }
- }
-
- return result.toString();
- }
-}
\ No newline at end of file
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/DefaultEncoder.java (working copy)
@@ -1,32 +0,0 @@
-package org.apache.lucene.search.highlight;
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Simple {@link Encoder} implementation that does not modify the output
- *
- */
-public class DefaultEncoder implements Encoder
-{
- public DefaultEncoder()
- {
- }
-
- public String encodeText(String originalText)
- {
- return originalText;
- }
-}
\ No newline at end of file
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java (working copy)
@@ -1,64 +0,0 @@
-package org.apache.lucene.search.highlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/** Lightweight class to hold term and a weight value used for scoring this term
- */
-public class WeightedTerm
-{
- float weight; // multiplier
- String term; //stemmed form
- public WeightedTerm (float weight,String term)
- {
- this.weight=weight;
- this.term=term;
- }
-
-
- /**
- * @return the term value (stemmed)
- */
- public String getTerm()
- {
- return term;
- }
-
- /**
- * @return the weight associated with this term
- */
- public float getWeight()
- {
- return weight;
- }
-
- /**
- * @param term the term value (stemmed)
- */
- public void setTerm(String term)
- {
- this.term = term;
- }
-
- /**
- * @param weight the weight associated with this term
- */
- public void setWeight(float weight)
- {
- this.weight = weight;
- }
-
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (working copy)
@@ -1,67 +0,0 @@
-package org.apache.lucene.search.highlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.TokenStream;
-
-/**
- * A Scorer is responsible for scoring a stream of tokens. These token scores
- * can then be used to compute {@link TextFragment} scores.
- */
-public interface Scorer {
-
- /**
- * Called to init the Scorer with a {@link TokenStream}. You can grab references to
- * the attributes you are interested in here and access them from {@link #getTokenScore()}.
- *
- * @param tokenStream the {@link TokenStream} that will be scored.
- * @return either a {@link TokenStream} that the Highlighter should continue using (eg
- * if you read the tokenSream in this method) or null to continue
- * using the same {@link TokenStream} that was passed in.
- * @throws IOException
- */
- public TokenStream init(TokenStream tokenStream) throws IOException;
-
- /**
- * Called when a new fragment is started for consideration.
- *
- * @param newFragment the fragment that will be scored next
- */
- public void startFragment(TextFragment newFragment);
-
- /**
- * Called for each token in the current fragment. The {@link Highlighter} will
- * increment the {@link TokenStream} passed to init on every call.
- *
- * @return a score which is passed to the {@link Highlighter} class to influence the
- * mark-up of the text (this return value is NOT used to score the
- * fragment)
- */
- public float getTokenScore();
-
- /**
- * Called when the {@link Highlighter} has no more tokens for the current fragment -
- * the Scorer returns the weighting it has derived for the most recent
- * fragment, typically based on the results of {@link #getTokenScore()}.
- *
- */
- public float getFragmentScore();
-
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/BaseFragmentsBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/BaseFragmentsBuilder.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/BaseFragmentsBuilder.java (revision 0)
@@ -0,0 +1,140 @@
+package org.apache.lucene.search.highlight.termvector;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.MapFieldSelector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.highlight.formatting.Formatter;
+import org.apache.lucene.search.highlight.formatting.HTMLTagFormatter;
+import org.apache.lucene.search.highlight.termvector.FieldFragList.WeightedFragInfo;
+import org.apache.lucene.search.highlight.termvector.FieldFragList.WeightedFragInfo.SubInfo;
+import org.apache.lucene.search.highlight.termvector.FieldPhraseList.WeightedPhraseInfo.Toffs;
+
+public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
+ private final Formatter formatter; // supplies the tags wrapped around every highlighted term
+ /** Creates a builder whose formatter is an HTMLTagFormatter constructed from two empty strings. */
+ protected BaseFragmentsBuilder(){
+ this(new HTMLTagFormatter("", ""));
+ }
+ /** Creates a builder that wraps highlighted terms in the given formatter's pre and post tags. */
+ protected BaseFragmentsBuilder(Formatter formatter){
+ this.formatter = formatter;
+ }
+ /** Returns tags unchanged when it is a String or a String[]; throws IllegalArgumentException otherwise. */
+ static Object checkTagsArgument( Object tags ){
+ if( tags instanceof String ) return tags;
+ else if( tags instanceof String[] ) return tags;
+ throw new IllegalArgumentException( "type of preTags/postTags must be a String or String[]" );
+ }
+ /** Returns the frag infos that createFragments turns into fragments, reading from index 0 upward. */
+ public abstract List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src );
+ /** Returns the first fragment of createFragments called with a limit of 1, or null when there is none. */
+ public String createFragment( IndexReader reader, int docId,
+ String fieldName, FieldFragList fieldFragList ) throws IOException {
+ String[] fragments = createFragments( reader, docId, fieldName, fieldFragList, 1 );
+ if( fragments == null || fragments.length == 0 ) return null;
+ return fragments[0];
+ }
+ /** Builds at most maxNumFragments fragments; returns null when the document has no field named fieldName. */
+ public String[] createFragments( IndexReader reader, int docId,
+ String fieldName, FieldFragList fieldFragList, int maxNumFragments )
+ throws IOException {
+ if( maxNumFragments < 0 )
+ throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
+
+ List<WeightedFragInfo> fragInfos = getWeightedFragInfoList( fieldFragList.fragInfos );
+
+ List<String> fragments = new ArrayList<String>( maxNumFragments );
+ Field[] values = getFields( reader, docId, fieldName );
+ if( values.length == 0 ) return null;
+ StringBuilder buffer = new StringBuilder(); // shared by all fragments: field values are appended to it on demand
+ int[] nextValueIndex = { 0 }; // one-element array so getFragmentSource can advance the index
+ for( int n = 0; n < maxNumFragments && n < fragInfos.size(); n++ ){
+ WeightedFragInfo fragInfo = fragInfos.get( n );
+ fragments.add( makeFragment( buffer, nextValueIndex, values, fragInfo ) );
+ }
+ return fragments.toArray( new String[fragments.size()] );
+ }
+ /** Loads only fieldName from the stored document and returns its string values. */
+ @Deprecated
+ protected String[] getFieldValues( IndexReader reader, int docId, String fieldName) throws IOException {
+ Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) );
+ return doc.getValues( fieldName ); // according to Document class javadoc, this never returns null
+ }
+ /** Loads only fieldName from the stored document and returns its Field instances. */
+ protected Field[] getFields( IndexReader reader, int docId, String fieldName) throws IOException {
+ // according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field???
+ Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) );
+ return doc.getFields( fieldName ); // according to Document class javadoc, this never returns null
+ }
+ /** Cuts the fragment text for fragInfo out of the String values and marks up its term offsets. */
+ @Deprecated
+ protected String makeFragment( StringBuilder buffer, int[] index, String[] values, WeightedFragInfo fragInfo ){
+ final int s = fragInfo.startOffset;
+ return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s );
+ }
+ /** Cuts the fragment text for fragInfo out of the Field values and marks up its term offsets. */
+ protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo ){
+ final int s = fragInfo.startOffset;
+ return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s );
+ }
+ /** Copies src, wrapping each term offset range (shifted left by s) in the formatter's pre and post tags. */
+ private String makeFragment( WeightedFragInfo fragInfo, String src, int s ){
+ StringBuilder fragment = new StringBuilder();
+ int srcIndex = 0; // position in src up to which text has already been copied
+ for( SubInfo subInfo : fragInfo.subInfos ){
+ for( Toffs to : subInfo.termsOffsets ){
+ fragment.append( src.substring( srcIndex, to.startOffset - s ) ).append( formatter.getPreTag() )
+ .append( src.substring( to.startOffset - s, to.endOffset - s ) ).append( formatter.getPostTag() );
+ srcIndex = to.endOffset - s;
+ }
+ }
+ fragment.append( src.substring( srcIndex ) );
+ return fragment.toString();
+ }
+ /** Appends values (space before each non-first, non-empty one) until buffer reaches endOffset; returns the requested slice. */
+ @Deprecated
+ protected String getFragmentSource( StringBuilder buffer, int[] index, String[] values,
+ int startOffset, int endOffset ){
+ while( buffer.length() < endOffset && index[0] < values.length ){
+ if( index[0] > 0 && values[index[0]].length() > 0 )
+ buffer.append( ' ' );
+ buffer.append( values[index[0]++] );
+ }
+ int eo = buffer.length() < endOffset ? buffer.length() : endOffset; // clamp to what is actually available
+ return buffer.substring( startOffset, eo );
+ }
+ /** Same as the String[] variant, but the separating space is only added before tokenized, non-empty values. */
+ protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values,
+ int startOffset, int endOffset ){
+ while( buffer.length() < endOffset && index[0] < values.length ){
+ if( index[0] > 0 && values[index[0]].isTokenized() && values[index[0]].stringValue().length() > 0 )
+ buffer.append( ' ' );
+ buffer.append( values[index[0]++].stringValue() );
+ }
+ int eo = buffer.length() < endOffset ? buffer.length() : endOffset; // clamp to what is actually available
+ return buffer.substring( startOffset, eo );
+ }
+
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FastVectorHighlighter.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FastVectorHighlighter.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FastVectorHighlighter.java (revision 0)
@@ -0,0 +1,137 @@
+package org.apache.lucene.search.highlight.termvector;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Query;
+
+/**
+ * Another highlighter implementation. For every request it builds a FieldTermStack and a FieldPhraseList,
+ * has the FragListBuilder turn them into a FieldFragList, and lets the FragmentsBuilder create the snippets.
+ */
+public class FastVectorHighlighter {
+
+ public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true;
+ public static final boolean DEFAULT_FIELD_MATCH = true;
+ private final boolean phraseHighlight;
+ private final boolean fieldMatch;
+ private final FragListBuilder fragListBuilder;
+ private final FragmentsBuilder fragmentsBuilder;
+
+ /**
+ * the default constructor. Uses DEFAULT_PHRASE_HIGHLIGHT and DEFAULT_FIELD_MATCH.
+ */
+ public FastVectorHighlighter(){
+ this( DEFAULT_PHRASE_HIGHLIGHT, DEFAULT_FIELD_MATCH );
+ }
+
+ /**
+ * a constructor. Using SimpleFragListBuilder and ScoreOrderFragmentsBuilder.
+ *
+ * @param phraseHighlight true or false for phrase highlighting
+ * @param fieldMatch true or false for field matching
+ */
+ public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch ){
+ this( phraseHighlight, fieldMatch, new SimpleFragListBuilder(), new ScoreOrderFragmentsBuilder() );
+ }
+
+ /**
+ * a constructor. A FragListBuilder and a FragmentsBuilder can be specified (plugins).
+ *
+ * @param phraseHighlight true or false for phrase highlighting
+ * @param fieldMatch true or false for field matching
+ * @param fragListBuilder an instance of FragListBuilder
+ * @param fragmentsBuilder an instance of FragmentsBuilder
+ */
+ public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch,
+ FragListBuilder fragListBuilder, FragmentsBuilder fragmentsBuilder ){
+ this.phraseHighlight = phraseHighlight;
+ this.fieldMatch = fieldMatch;
+ this.fragListBuilder = fragListBuilder;
+ this.fragmentsBuilder = fragmentsBuilder;
+ }
+
+ /**
+ * create a FieldQuery object from the query, using this highlighter's phraseHighlight and fieldMatch settings.
+ *
+ * @param query a query
+ * @return the created FieldQuery object
+ */
+ public FieldQuery getFieldQuery( Query query ){
+ return new FieldQuery( query, phraseHighlight, fieldMatch );
+ }
+
+ /**
+ * return the best fragment, as produced by the FragmentsBuilder's createFragment method.
+ *
+ * @param fieldQuery FieldQuery object
+ * @param reader IndexReader of the index
+ * @param docId document id to be highlighted
+ * @param fieldName field of the document to be highlighted
+ * @param fragCharSize the length (number of chars) of a fragment
+ * @return the best fragment (snippet) string
+ * @throws IOException
+ */
+ public final String getBestFragment( final FieldQuery fieldQuery, IndexReader reader, int docId,
+ String fieldName, int fragCharSize ) throws IOException {
+ FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize );
+ return fragmentsBuilder.createFragment( reader, docId, fieldName, fieldFragList );
+ }
+
+ /**
+ * return the best fragments, as produced by the FragmentsBuilder's createFragments method.
+ *
+ * @param fieldQuery FieldQuery object
+ * @param reader IndexReader of the index
+ * @param docId document id to be highlighted
+ * @param fieldName field of the document to be highlighted
+ * @param fragCharSize the length (number of chars) of a fragment
+ * @param maxNumFragments maximum number of fragments
+ * @return created fragments or null when no fragments created.
+ * size of the array can be less than maxNumFragments
+ * @throws IOException
+ */
+ public final String[] getBestFragments( final FieldQuery fieldQuery, IndexReader reader, int docId,
+ String fieldName, int fragCharSize, int maxNumFragments ) throws IOException {
+ FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize );
+ return fragmentsBuilder.createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments );
+ }
+ // Builds the term stack and phrase list for the field, then lets the FragListBuilder cut them into a frag list.
+ private FieldFragList getFieldFragList( final FieldQuery fieldQuery, IndexReader reader, int docId,
+ String fieldName, int fragCharSize ) throws IOException {
+ FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, fieldName, fieldQuery );
+ FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery );
+ return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize );
+ }
+
+ /**
+ * return whether phraseHighlight or not.
+ *
+ * @return whether phraseHighlight or not
+ */
+ public boolean isPhraseHighlight(){ return phraseHighlight; }
+
+ /**
+ * return whether fieldMatch or not.
+ *
+ * @return whether fieldMatch or not
+ */
+ public boolean isFieldMatch(){ return fieldMatch; }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldFragList.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldFragList.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldFragList.java (revision 0)
@@ -0,0 +1,128 @@
+package org.apache.lucene.search.highlight.termvector;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.search.highlight.termvector.FieldPhraseList.WeightedPhraseInfo;
+import org.apache.lucene.search.highlight.termvector.FieldPhraseList.WeightedPhraseInfo.Toffs;
+
+/**
+ * FieldFragList has a list of "frag info" that is used by FragmentsBuilder class
+ * to create fragments (snippets).
+ */
+public class FieldFragList {
+
+ List<WeightedFragInfo> fragInfos = new ArrayList<WeightedFragInfo>();
+
+ /**
+ * a constructor.
+ * The argument is not used by this constructor; the frag info list starts out empty.
+ * @param fragCharSize the length (number of chars) of a fragment
+ */
+ public FieldFragList( int fragCharSize ){
+ }
+
+ /**
+ * convert the list of WeightedPhraseInfo to WeightedFragInfo, then add it to the fragInfos
+ *
+ * @param startOffset start offset of the fragment
+ * @param endOffset end offset of the fragment
+ * @param phraseInfoList list of WeightedPhraseInfo objects
+ */
+ public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ){
+ fragInfos.add( new WeightedFragInfo( startOffset, endOffset, phraseInfoList ) );
+ }
+
+ public static class WeightedFragInfo {
+ // One fragment: its offset range, one SubInfo per phrase inside it, and the sum of the phrases' boosts.
+ List<SubInfo> subInfos;
+ float totalBoost;
+ int startOffset;
+ int endOffset;
+ /** Creates one SubInfo per phrase info and adds each phrase's boost to totalBoost. */
+ public WeightedFragInfo( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ){
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
+ subInfos = new ArrayList<SubInfo>();
+ for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
+ SubInfo subInfo = new SubInfo( phraseInfo.text, phraseInfo.termsOffsets, phraseInfo.seqnum );
+ subInfos.add( subInfo );
+ totalBoost += phraseInfo.boost;
+ }
+ }
+
+ public List<SubInfo> getSubInfos(){
+ return subInfos;
+ }
+
+ public float getTotalBoost(){
+ return totalBoost;
+ }
+
+ public int getStartOffset(){
+ return startOffset;
+ }
+
+ public int getEndOffset(){
+ return endOffset;
+ }
+ /** Formats as subInfos=(...)/totalBoost(startOffset,endOffset); the text member is documented as debug-only. */
+ @Override
+ public String toString(){
+ StringBuilder sb = new StringBuilder();
+ sb.append( "subInfos=(" );
+ for( SubInfo si : subInfos )
+ sb.append( si.toString() );
+ sb.append( ")/" ).append( totalBoost ).append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' );
+ return sb.toString();
+ }
+
+ public static class SubInfo {
+ final String text; // unnecessary member, just exists for debugging purpose
+ final List<Toffs> termsOffsets; // usually termsOffsets.size() == 1,
+ // but if position-gap > 1 and slop > 0 then size() could be greater than 1
+ int seqnum;
+ /** Stores the given references as-is; the termsOffsets list is not copied. */
+ SubInfo( String text, List<Toffs> termsOffsets, int seqnum ){
+ this.text = text;
+ this.termsOffsets = termsOffsets;
+ this.seqnum = seqnum;
+ }
+
+ public List<Toffs> getTermsOffsets(){
+ return termsOffsets;
+ }
+
+ public int getSeqnum(){
+ return seqnum;
+ }
+
+ @Override
+ public String toString(){
+ StringBuilder sb = new StringBuilder();
+ sb.append( text ).append( '(' );
+ for( Toffs to : termsOffsets )
+ sb.append( to.toString() );
+ sb.append( ')' );
+ return sb.toString();
+ }
+ }
+ }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldPhraseList.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldPhraseList.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldPhraseList.java (revision 0)
@@ -0,0 +1,191 @@
+package org.apache.lucene.search.highlight.termvector;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.search.highlight.termvector.FieldQuery.QueryPhraseMap;
+import org.apache.lucene.search.highlight.termvector.FieldTermStack.TermInfo;
+
+/**
+ * FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder
+ * to create a FieldFragList object.
+ */
+public class FieldPhraseList {
+
+ LinkedList<WeightedPhraseInfo> phraseList = new LinkedList<WeightedPhraseInfo>();
+
+ /**
+ * a constructor.
+ * Pops terms off the stack until it is empty, recording the longest phrase accepted by the query maps at each match.
+ * @param fieldTermStack FieldTermStack object
+ * @param fieldQuery FieldQuery object
+ */
+ public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery ){
+ final String field = fieldTermStack.getFieldName();
+
+ LinkedList<TermInfo> phraseCandidate = new LinkedList<TermInfo>();
+ QueryPhraseMap currMap = null;
+ QueryPhraseMap nextMap = null;
+ while( !fieldTermStack.isEmpty() ){
+
+ phraseCandidate.clear();
+
+ TermInfo ti = fieldTermStack.pop();
+ currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
+
+ // if not found, discard top TermInfo from stack, then try next element
+ if( currMap == null ) continue;
+
+ // if found, search the longest phrase
+ phraseCandidate.add( ti );
+ while( true ){
+ ti = fieldTermStack.pop();
+ nextMap = null;
+ if( ti != null )
+ nextMap = currMap.getTermMap( ti.getText() );
+ if( ti == null || nextMap == null ){
+ if( ti != null )
+ fieldTermStack.push( ti ); // not part of this phrase: give it back for the next round
+ if( currMap.isValidTermOrPhrase( phraseCandidate ) ){
+ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
+ }
+ else{
+ while( phraseCandidate.size() > 1 ){ // shrink the candidate from the end until a phrase matches
+ fieldTermStack.push( phraseCandidate.removeLast() );
+ currMap = fieldQuery.searchPhrase( field, phraseCandidate );
+ if( currMap != null ){
+ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
+ break;
+ }
+ }
+ }
+ break;
+ }
+ else{
+ phraseCandidate.add( ti );
+ currMap = nextMap;
+ }
+ }
+ }
+ }
+ /** Appends wpi unless its offset range overlaps a phrase that is already in phraseList. */
+ void addIfNoOverlap( WeightedPhraseInfo wpi ){
+ for( WeightedPhraseInfo existWpi : phraseList ){
+ if( existWpi.isOffsetOverlap( wpi ) ) return;
+ }
+ phraseList.add( wpi );
+ }
+
+ public static class WeightedPhraseInfo {
+
+ String text; // unnecessary member, just exists for debugging purpose
+ List<Toffs> termsOffsets; // usually termsOffsets.size() == 1,
+ // but if position-gap > 1 and slop > 0 then size() could be greater than 1
+ float boost; // query boost
+ int seqnum;
+ /** Same as the three-argument constructor with a sequence number of 0. */
+ public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost ){
+ this( terms, boost, 0 );
+ }
+ /** Concatenates the term texts and merges the offsets of terms at consecutive positions into one Toffs. */
+ public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost, int number ){
+ this.boost = boost;
+ this.seqnum = number;
+ termsOffsets = new ArrayList<Toffs>( terms.size() );
+ TermInfo ti = terms.get( 0 );
+ termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
+ if( terms.size() == 1 ){
+ text = ti.getText();
+ return;
+ }
+ StringBuilder sb = new StringBuilder();
+ sb.append( ti.getText() );
+ int pos = ti.getPosition();
+ for( int i = 1; i < terms.size(); i++ ){
+ ti = terms.get( i );
+ sb.append( ti.getText() );
+ if( ti.getPosition() - pos == 1 ){ // adjacent position: extend the previous offset range
+ Toffs to = termsOffsets.get( termsOffsets.size() - 1 );
+ to.setEndOffset( ti.getEndOffset() );
+ }
+ else{
+ termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
+ }
+ pos = ti.getPosition();
+ }
+ text = sb.toString();
+ }
+
+ public int getStartOffset(){
+ return termsOffsets.get( 0 ).startOffset;
+ }
+
+ public int getEndOffset(){
+ return termsOffsets.get( termsOffsets.size() - 1 ).endOffset;
+ }
+ /** Returns true when the [start,end) offset range of this phrase and of other share at least one position. */
+ public boolean isOffsetOverlap( WeightedPhraseInfo other ){
+ int so = getStartOffset();
+ int eo = getEndOffset();
+ int oso = other.getStartOffset();
+ int oeo = other.getEndOffset();
+ if( so <= oso && oso < eo ) return true;
+ if( so < oeo && oeo <= eo ) return true;
+ if( oso <= so && so < oeo ) return true;
+ if( oso < eo && eo <= oeo ) return true;
+ return false;
+ }
+
+ @Override
+ public String toString(){
+ StringBuilder sb = new StringBuilder();
+ sb.append( text ).append( '(' ).append( boost ).append( ")(" );
+ for( Toffs to : termsOffsets ){
+ sb.append( to );
+ }
+ sb.append( ')' );
+ return sb.toString();
+ }
+
+ public static class Toffs {
+ int startOffset;
+ int endOffset;
+ public Toffs( int startOffset, int endOffset ){
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
+ }
+ public void setEndOffset( int endOffset ){
+ this.endOffset = endOffset;
+ }
+ public int getStartOffset(){
+ return startOffset;
+ }
+ public int getEndOffset(){
+ return endOffset;
+ }
+ @Override
+ public String toString(){
+ StringBuilder sb = new StringBuilder();
+ sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' );
+ return sb.toString();
+ }
+ }
+ }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldQuery.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldQuery.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldQuery.java (revision 0)
@@ -0,0 +1,399 @@
+package org.apache.lucene.search.highlight.termvector;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.highlight.termvector.FieldTermStack.TermInfo;
+
+/**
+ * FieldQuery breaks down query object into terms/phrases and keep
+ * them in QueryPhraseMap structure.
+ */
+public class FieldQuery {
+
+ final boolean fieldMatch;
+
+ // fieldMatch==true, Map<fieldName,QueryPhraseMap>
+ // fieldMatch==false, Map<null,QueryPhraseMap>
+ Map<String, QueryPhraseMap> rootMaps = new HashMap<String, QueryPhraseMap>();
+
+ // fieldMatch==true, Map<fieldName,setOfTermsInQueries>
+ // fieldMatch==false, Map<null,setOfTermsInQueries>
+ Map<String,Set<String>> termSetMap = new HashMap<String,Set<String>>();
+
+ int termOrPhraseNumber; // used for colored tag support
+
+ FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ){ // flattens query, records its terms, then fills the root maps
+ this.fieldMatch = fieldMatch;
+ Set<Query> flatQueries = new HashSet<Query>(); // the TermQuery/PhraseQuery leaves of query
+ flatten( query, flatQueries );
+ saveTerms( flatQueries ); // must run before expand(), which empties flatQueries
+ Collection<Query> expandQueries = expand( flatQueries );
+
+ for( Query flatQuery : expandQueries ){
+ QueryPhraseMap rootMap = getRootMap( flatQuery );
+ rootMap.add( flatQuery );
+ if( !phraseHighlight && flatQuery instanceof PhraseQuery ){ // phrase highlighting off: also register each phrase term on its own
+ PhraseQuery pq = (PhraseQuery)flatQuery;
+ if( pq.getTerms().length > 1 ){
+ for( Term term : pq.getTerms() )
+ rootMap.addTerm( term, flatQuery.getBoost() );
+ }
+ }
+ }
+ }
+
+ void flatten( Query sourceQuery, Collection<Query> flatQueries ){ // collects the term/phrase leaves of sourceQuery into flatQueries
+ if( sourceQuery instanceof BooleanQuery ){
+ BooleanQuery bq = (BooleanQuery)sourceQuery;
+ for( BooleanClause clause : bq.getClauses() ){
+ if( !clause.isProhibited() ) // prohibited clauses are skipped, so their terms are never collected
+ flatten( clause.getQuery(), flatQueries );
+ }
+ }
+ else if( sourceQuery instanceof DisjunctionMaxQuery ){
+ DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
+ for( Query query : dmq ){
+ flatten( query, flatQueries );
+ }
+ }
+ else if( sourceQuery instanceof TermQuery ){
+ if( !flatQueries.contains( sourceQuery ) )
+ flatQueries.add( sourceQuery );
+ }
+ else if( sourceQuery instanceof PhraseQuery ){
+ if( !flatQueries.contains( sourceQuery ) ){
+ PhraseQuery pq = (PhraseQuery)sourceQuery;
+ if( pq.getTerms().length > 1 )
+ flatQueries.add( pq );
+ else if( pq.getTerms().length == 1 ){ // a one-term phrase is stored as a plain TermQuery
+ flatQueries.add( new TermQuery( pq.getTerms()[0] ) );
+ }
+ }
+ }
+ // else discard queries
+ }
+
+ /*
+ * Create expandQueries from flatQueries. Note that flatQueries is emptied in the process.
+ *
+ * expandQueries := flatQueries + overlapped phrase queries
+ *
+ * ex1) flatQueries={a,b,c}
+ * => expandQueries={a,b,c}
+ * ex2) flatQueries={a,"b c","c d"}
+ * => expandQueries={a,"b c","c d","b c d"}
+ */
+ Collection<Query> expand( Collection<Query> flatQueries ){
+ Set<Query> expandQueries = new HashSet<Query>();
+ for( Iterator<Query> i = flatQueries.iterator(); i.hasNext(); ){
+ Query query = i.next();
+ i.remove(); // removed first, so the inner loop only sees the queries not yet processed
+ expandQueries.add( query );
+ if( !( query instanceof PhraseQuery ) ) continue;
+ for( Iterator<Query> j = flatQueries.iterator(); j.hasNext(); ){
+ Query qj = j.next();
+ if( !( qj instanceof PhraseQuery ) ) continue;
+ checkOverlap( expandQueries, (PhraseQuery)query, (PhraseQuery)qj );
+ }
+ }
+ return expandQueries;
+ }
+
+ /*
+ * Check if PhraseQuery A and B have overlapped part. Phrases with different slop are never merged,
+ * and when fieldMatch is on neither are phrases whose first terms are in different fields.
+ * ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"}
+ * ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"}
+ * ex3) A="a b", B="c d" => no overlap; expandQueries={}
+ */
+ private void checkOverlap( Collection<Query> expandQueries, PhraseQuery a, PhraseQuery b ){
+ if( a.getSlop() != b.getSlop() ) return;
+ Term[] ats = a.getTerms();
+ Term[] bts = b.getTerms();
+ if( fieldMatch && !ats[0].field().equals( bts[0].field() ) ) return;
+ checkOverlap( expandQueries, ats, bts, a.getSlop(), a.getBoost() ); // tail of a against head of b
+ checkOverlap( expandQueries, bts, ats, b.getSlop(), b.getBoost() ); // tail of b against head of a
+ }
+
+ /*
+ * Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries.
+ * A merged phrase takes the field of src[0] for all appended terms, plus the given slop and boost.
+ * ex1) src="a b", dest="c d" => no overlap
+ * ex2) src="a b", dest="a b c" => no overlap
+ * ex3) src="a b", dest="b c" => overlap; expandQueries={"a b c"}
+ * ex4) src="a b c", dest="b c d" => overlap; expandQueries={"a b c d"}
+ * ex5) src="a b c", dest="b c" => no overlap
+ * ex6) src="a b c", dest="b" => no overlap
+ * ex7) src="a a a a", dest="a a a" => overlap;
+ * expandQueries={"a a a a a","a a a a a a"}
+ * ex8) src="a b c d", dest="b c" => no overlap
+ */
+ private void checkOverlap( Collection<Query> expandQueries, Term[] src, Term[] dest, int slop, float boost ){
+ // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms
+ // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten()
+ // converts PhraseQuery to TermQuery)
+ for( int i = 1; i < src.length; i++ ){
+ boolean overlap = true;
+ for( int j = i; j < src.length; j++ ){ // does src[i..] match the beginning of dest?
+ if( ( j - i ) < dest.length && !src[j].text().equals( dest[j-i].text() ) ){
+ overlap = false;
+ break;
+ }
+ }
+ if( overlap && src.length - i < dest.length ){ // dest must extend past the end of src
+ PhraseQuery pq = new PhraseQuery();
+ for( Term srcTerm : src )
+ pq.add( srcTerm );
+ for( int k = src.length - i; k < dest.length; k++ ){
+ pq.add( new Term( src[0].field(), dest[k].text() ) );
+ }
+ pq.setSlop( slop );
+ pq.setBoost( boost );
+ if(!expandQueries.contains( pq ) )
+ expandQueries.add( pq );
+ }
+ }
+ }
+
+ QueryPhraseMap getRootMap( Query query ){ // returns the root map for the query's key, creating it on first use
+ final String fieldKey = getKey( query );
+ QueryPhraseMap rootMap = rootMaps.get( fieldKey );
+ if( rootMap != null ) return rootMap;
+ // first query seen for this key: create a fresh map and register it
+ rootMap = new QueryPhraseMap( this );
+ rootMaps.put( fieldKey, rootMap );
+ return rootMap;
+ }
+
+ /*
+ * Return 'key' string. 'key' is the field name of the Query.
+ * If not fieldMatch, 'key' will be null.
+ */
+ private String getKey( Query query ){
+ if( !fieldMatch ) return null;
+ if( query instanceof TermQuery )
+ return ((TermQuery)query).getTerm().field();
+ else if ( query instanceof PhraseQuery ){
+ PhraseQuery pq = (PhraseQuery)query;
+ Term[] terms = pq.getTerms();
+ return terms[0].field();
+ }
+ else
+ throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
+ }
+
+ /*
+ * Save the set of terms in the queries to termSetMap.
+ *
+ * ex1) q=name:john
+ * - fieldMatch==true
+ * termSetMap=Map<"name",Set<"john">>
+ * - fieldMatch==false
+ * termSetMap=Map<null,Set<"john">>
+ *
+ * ex2) q=name:john title:manager
+ * - fieldMatch==true
+ * termSetMap=Map<"name",Set<"john">,
+ * "title",Set<"manager">>
+ * - fieldMatch==false
+ * termSetMap=Map<null,Set<"john","manager">>
+ *
+ * ex3) q=name:"john lennon"
+ * - fieldMatch==true
+ * termSetMap=Map<"name",Set<"john","lennon">>
+ * - fieldMatch==false
+ * termSetMap=Map<null,Set<"john","lennon">>
+ */
+ void saveTerms( Collection flatQueries ){
+ for( Query query : flatQueries ){
+ Set termSet = getTermSet( query );
+ if( query instanceof TermQuery )
+ termSet.add( ((TermQuery)query).getTerm().text() );
+ else if( query instanceof PhraseQuery ){
+ for( Term term : ((PhraseQuery)query).getTerms() )
+ termSet.add( term.text() );
+ }
+ else
+ throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
+ }
+ }
+
+ private Set getTermSet( Query query ){
+ String key = getKey( query );
+ Set set = termSetMap.get( key );
+ if( set == null ){
+ set = new HashSet();
+ termSetMap.put( key, set );
+ }
+ return set;
+ }
+
+ Set getTermSet( String field ){
+ return termSetMap.get( fieldMatch ? field : null );
+ }
+
+ /**
+ *
+ * @param fieldName
+ * @param term
+ * @return QueryPhraseMap
+ */
+ public QueryPhraseMap getFieldTermMap( String fieldName, String term ){
+ QueryPhraseMap rootMap = getRootMap( fieldName );
+ return rootMap == null ? null : rootMap.subMap.get( term );
+ }
+
+ /**
+ *
+ * @param fieldName
+ * @param phraseCandidate
+ * @return QueryPhraseMap
+ */
+ public QueryPhraseMap searchPhrase( String fieldName, final List phraseCandidate ){
+ QueryPhraseMap root = getRootMap( fieldName );
+ if( root == null ) return null;
+ return root.searchPhrase( phraseCandidate );
+ }
+
+ private QueryPhraseMap getRootMap( String fieldName ){
+ return rootMaps.get( fieldMatch ? fieldName : null );
+ }
+
+ int nextTermOrPhraseNumber(){
+ return termOrPhraseNumber++;
+ }
+
+ public static class QueryPhraseMap {
+
+ boolean terminal;
+ int slop; // valid if terminal == true and phraseHighlight == true
+ float boost; // valid if terminal == true
+ int termOrPhraseNumber; // valid if terminal == true
+ FieldQuery fieldQuery;
+ Map subMap = new HashMap();
+
+ public QueryPhraseMap( FieldQuery fieldQuery ){
+ this.fieldQuery = fieldQuery;
+ }
+
+ void addTerm( Term term, float boost ){
+ QueryPhraseMap map = getOrNewMap( subMap, term.text() );
+ map.markTerminal( boost );
+ }
+
+ private QueryPhraseMap getOrNewMap( Map subMap, String term ){
+ QueryPhraseMap map = subMap.get( term );
+ if( map == null ){
+ map = new QueryPhraseMap( fieldQuery );
+ subMap.put( term, map );
+ }
+ return map;
+ }
+
+ void add( Query query ){
+ if( query instanceof TermQuery ){
+ addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
+ }
+ else if( query instanceof PhraseQuery ){
+ PhraseQuery pq = (PhraseQuery)query;
+ Term[] terms = pq.getTerms();
+ Map map = subMap;
+ QueryPhraseMap qpm = null;
+ for( Term term : terms ){
+ qpm = getOrNewMap( map, term.text() );
+ map = qpm.subMap;
+ }
+ qpm.markTerminal( pq.getSlop(), pq.getBoost() );
+ }
+ else
+ throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
+ }
+
+ public QueryPhraseMap getTermMap( String term ){
+ return subMap.get( term );
+ }
+
+ private void markTerminal( float boost ){
+ markTerminal( 0, boost );
+ }
+
+ private void markTerminal( int slop, float boost ){
+ this.terminal = true;
+ this.slop = slop;
+ this.boost = boost;
+ this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
+ }
+
+ public boolean isTerminal(){
+ return terminal;
+ }
+
+ public int getSlop(){
+ return slop;
+ }
+
+ public float getBoost(){
+ return boost;
+ }
+
+ public int getTermOrPhraseNumber(){
+ return termOrPhraseNumber;
+ }
+
+ public QueryPhraseMap searchPhrase( final List phraseCandidate ){
+ QueryPhraseMap currMap = this;
+ for( TermInfo ti : phraseCandidate ){
+ currMap = currMap.subMap.get( ti.getText() );
+ if( currMap == null ) return null;
+ }
+ return currMap.isValidTermOrPhrase( phraseCandidate ) ? currMap : null;
+ }
+
+ public boolean isValidTermOrPhrase( final List phraseCandidate ){
+ // check terminal
+ if( !terminal ) return false;
+
+ // if the candidate is a term, it is valid
+ if( phraseCandidate.size() == 1 ) return true;
+
+ // else check whether the candidate is valid phrase
+ // compare position-gaps between terms to slop
+ int pos = phraseCandidate.get( 0 ).getPosition();
+ for( int i = 1; i < phraseCandidate.size(); i++ ){
+ int nextPos = phraseCandidate.get( i ).getPosition();
+ if( Math.abs( nextPos - pos - 1 ) > slop ) return false;
+ pos = nextPos;
+ }
+ return true;
+ }
+ }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldTermStack.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldTermStack.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FieldTermStack.java (revision 0)
@@ -0,0 +1,159 @@
+package org.apache.lucene.search.highlight.termvector;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.Set;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+
+/**
+ * FieldTermStack is a stack that keeps query terms in the specified field
+ * of the document to be highlighted.
+ */
+public class FieldTermStack {
+
+ private final String fieldName;
+ LinkedList termList = new LinkedList();
+
+ //public static void main( String[] args ) throws Exception {
+ // Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT);
+ // QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "f", analyzer );
+ // Query query = parser.parse( "a x:b" );
+ // FieldQuery fieldQuery = new FieldQuery( query, true, false );
+
+ // Directory dir = new RAMDirectory();
+ // IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer));
+ // Document doc = new Document();
+ // doc.add( new Field( "f", "a a a b b c a b b c d e f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
+ // doc.add( new Field( "f", "b a b a f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
+ // writer.addDocument( doc );
+ // writer.close();
+
+ // IndexReader reader = IndexReader.open( dir, true );
+ // new FieldTermStack( reader, 0, "f", fieldQuery );
+ // reader.close();
+ //}
+
+ /**
+ * a constructor.
+ *
+ * @param reader IndexReader of the index
+ * @param docId document id to be highlighted
+ * @param fieldName field of the document to be highlighted
+ * @param fieldQuery FieldQuery object
+ * @throws IOException
+ */
+ public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException {
+ this.fieldName = fieldName;
+
+ TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName );
+ if( tfv == null ) return; // just return to make null snippets
+ TermPositionVector tpv = null;
+ try{
+ tpv = (TermPositionVector)tfv;
+ }
+ catch( ClassCastException e ){
+ return; // just return to make null snippets
+ }
+
+ Set termSet = fieldQuery.getTermSet( fieldName );
+ // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
+ if( termSet == null ) return;
+
+ for( String term : tpv.getTerms() ){
+ if( !termSet.contains( term ) ) continue;
+ int index = tpv.indexOf( term );
+ TermVectorOffsetInfo[] tvois = tpv.getOffsets( index );
+ if( tvois == null ) return; // just return to make null snippets
+ int[] poss = tpv.getTermPositions( index );
+ if( poss == null ) return; // just return to make null snippets
+ for( int i = 0; i < tvois.length; i++ )
+ termList.add( new TermInfo( term, tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) );
+ }
+
+ // sort by position
+ Collections.sort( termList );
+ }
+
+ /**
+ * @return field name
+ */
+ public String getFieldName(){
+ return fieldName;
+ }
+
+ /**
+ * @return the top TermInfo object of the stack
+ */
+ public TermInfo pop(){
+ return termList.poll();
+ }
+
+ /**
+ * @param termInfo the TermInfo object to be put on the top of the stack
+ */
+ public void push( TermInfo termInfo ){
+ // termList.push( termInfo ); // avoid Java 1.6 feature
+ termList.addFirst( termInfo );
+ }
+
+ /**
+ * to know whether the stack is empty
+ *
+ * @return true if the stack is empty, false if not
+ */
+ public boolean isEmpty(){
+ return termList == null || termList.size() == 0;
+ }
+
+ public static class TermInfo implements Comparable{
+
+ final String text;
+ final int startOffset;
+ final int endOffset;
+ final int position;
+
+ TermInfo( String text, int startOffset, int endOffset, int position ){
+ this.text = text;
+ this.startOffset = startOffset;
+ this.endOffset = endOffset;
+ this.position = position;
+ }
+
+ public String getText(){ return text; }
+ public int getStartOffset(){ return startOffset; }
+ public int getEndOffset(){ return endOffset; }
+ public int getPosition(){ return position; }
+
+ @Override
+ public String toString(){
+ StringBuilder sb = new StringBuilder();
+ sb.append( text ).append( '(' ).append(startOffset).append( ',' ).append( endOffset ).append( ',' ).append( position ).append( ')' );
+ return sb.toString();
+ }
+
+ public int compareTo( TermInfo o ) {
+ return ( this.position - o.position );
+ }
+ }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragListBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragListBuilder.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragListBuilder.java (revision 0)
@@ -0,0 +1,34 @@
+package org.apache.lucene.search.highlight.termvector;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * FragListBuilder is an interface for FieldFragList builder classes.
+ * A FragListBuilder class can be plugged in to Highlighter.
+ */
+public interface FragListBuilder {
+
+ /**
+ * create a FieldFragList.
+ *
+ * @param fieldPhraseList FieldPhraseList object
+ * @param fragCharSize the length (number of chars) of a fragment
+ * @return the created FieldFragList object
+ */
+ public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize );
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragmentsBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragmentsBuilder.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/FragmentsBuilder.java (revision 0)
@@ -0,0 +1,57 @@
+package org.apache.lucene.search.highlight.termvector;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+
+/**
+ * FragmentsBuilder is an interface for fragments (snippets) builder classes.
+ * A FragmentsBuilder class can be plugged in to Highlighter.
+ */
+public interface FragmentsBuilder {
+
+ /**
+ * create a fragment.
+ *
+ * @param reader IndexReader of the index
+ * @param docId document id to be highlighted
+ * @param fieldName field of the document to be highlighted
+ * @param fieldFragList FieldFragList object
+ * @return a created fragment or null when no fragment created
+ * @throws IOException
+ */
+ public String createFragment( IndexReader reader, int docId, String fieldName,
+ FieldFragList fieldFragList ) throws IOException;
+
+ /**
+ * create multiple fragments.
+ *
+ * @param reader IndexReader of the index
+ * @param docId document id to be highlighted
+ * @param fieldName field of the document to be highlighted
+ * @param fieldFragList FieldFragList object
+ * @param maxNumFragments maximum number of fragments
+ * @return created fragments or null when no fragments created.
+ * size of the array can be less than maxNumFragments
+ * @throws IOException
+ */
+ public String[] createFragments( IndexReader reader, int docId, String fieldName,
+ FieldFragList fieldFragList, int maxNumFragments ) throws IOException;
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/ScoreOrderFragmentsBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/ScoreOrderFragmentsBuilder.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/ScoreOrderFragmentsBuilder.java (revision 0)
@@ -0,0 +1,68 @@
+package org.apache.lucene.search.highlight.termvector;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.search.highlight.formatting.Formatter;
+import org.apache.lucene.search.highlight.termvector.FieldFragList.WeightedFragInfo;
+
+/**
+ * An implementation of FragmentsBuilder that outputs score-order fragments.
+ */
+public class ScoreOrderFragmentsBuilder extends BaseFragmentsBuilder {
+
+ /**
+ * a constructor.
+ */
+ public ScoreOrderFragmentsBuilder(){
+ super();
+ }
+
+ /**
+ * a constructor.
+ */
+ public ScoreOrderFragmentsBuilder(Formatter formatter){
+ super(formatter);
+ }
+
+ /**
+ * Sort the list of WeightedFragInfo by score (highest totalBoost first, ties by ascending startOffset)
+ */
+ @Override
+ public List getWeightedFragInfoList( List src ) {
+ Collections.sort( src, new ScoreComparator() );
+ return src;
+ }
+
+ public static class ScoreComparator implements Comparator {
+
+ public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) {
+ if( o1.totalBoost > o2.totalBoost ) return -1;
+ else if( o1.totalBoost < o2.totalBoost ) return 1;
+ // if same score then check startOffset
+ else{
+ if( o1.startOffset < o2.startOffset ) return -1;
+ else if( o1.startOffset > o2.startOffset ) return 1;
+ }
+ return 0;
+ }
+ }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragListBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragListBuilder.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragListBuilder.java (revision 0)
@@ -0,0 +1,84 @@
+package org.apache.lucene.search.highlight.termvector;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.search.highlight.termvector.FieldPhraseList.WeightedPhraseInfo;
+
+/**
+ * A simple implementation of FragListBuilder.
+ */
+public class SimpleFragListBuilder implements FragListBuilder {
+
+ public static final int MARGIN = 6;
+ public static final int MIN_FRAG_CHAR_SIZE = MARGIN * 3;
+
+ public FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) {
+ if( fragCharSize < MIN_FRAG_CHAR_SIZE )
+ throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " +
+ MIN_FRAG_CHAR_SIZE + " or higher." );
+
+ FieldFragList ffl = new FieldFragList( fragCharSize );
+
+ List wpil = new ArrayList();
+ Iterator ite = fieldPhraseList.phraseList.iterator();
+ WeightedPhraseInfo phraseInfo = null;
+ int startOffset = 0;
+ boolean taken = false;
+ while( true ){
+ if( !taken ){
+ if( !ite.hasNext() ) break;
+ phraseInfo = ite.next();
+ }
+ taken = false;
+ if( phraseInfo == null ) break;
+
+ // if the phrase violates the border of previous fragment, discard it and try next phrase
+ if( phraseInfo.getStartOffset() < startOffset ) continue;
+
+ wpil.clear();
+ wpil.add( phraseInfo );
+ int st = phraseInfo.getStartOffset() - MARGIN < startOffset ?
+ startOffset : phraseInfo.getStartOffset() - MARGIN;
+ int en = st + fragCharSize;
+ if( phraseInfo.getEndOffset() > en )
+ en = phraseInfo.getEndOffset();
+ startOffset = en;
+
+ while( true ){
+ if( ite.hasNext() ){
+ phraseInfo = ite.next();
+ taken = true;
+ if( phraseInfo == null ) break;
+ }
+ else
+ break;
+ if( phraseInfo.getEndOffset() <= en )
+ wpil.add( phraseInfo );
+ else
+ break;
+ }
+ ffl.add( st, en, wpil );
+ }
+ return ffl;
+ }
+
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragmentsBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragmentsBuilder.java (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/SimpleFragmentsBuilder.java (revision 0)
@@ -0,0 +1,52 @@
+package org.apache.lucene.search.highlight.termvector;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.List;
+
+import org.apache.lucene.search.highlight.formatting.Formatter;
+import org.apache.lucene.search.highlight.termvector.FieldFragList.WeightedFragInfo;
+
+/**
+ * A simple implementation of FragmentsBuilder.
+ *
+ */
+public class SimpleFragmentsBuilder extends BaseFragmentsBuilder {
+
+ /**
+ * a constructor.
+ */
+ public SimpleFragmentsBuilder() {
+ super();
+ }
+
+ /**
+ * a constructor.
+ */
+ public SimpleFragmentsBuilder(Formatter formatter) {
+ super(formatter);
+ }
+
+ /**
+ * do nothing. return the source list.
+ */
+ @Override
+ public List getWeightedFragInfoList( List src ) {
+ return src;
+ }
+}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/package.html
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/package.html (revision 0)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/termvector/package.html (revision 0)
@@ -0,0 +1,143 @@
+
+
+
+
+This is another highlighter implementation.
+
+Features
+
+- fast for large docs
+- support N-gram fields
+- support phrase-unit highlighting with slops
+- need Java 1.5
+- highlight fields need to be TermVector.WITH_POSITIONS_OFFSETS
+- take into account query boost to score fragments
+- support colored highlight tags
+- pluggable FragListBuilder
+- pluggable FragmentsBuilder
+
+
+Algorithm
+To explain the algorithm, let's use the following sample text
+ (to be highlighted) and user query:
+
+
+
+| Sample Text |
+Lucene is a search engine library. |
+
+
+| User Query |
+Lucene^2 OR "search library"~1 |
+
+
+
+The user query is a BooleanQuery that consists of TermQuery("Lucene")
+with boost of 2 and PhraseQuery("search library") with slop of 1.
+For your convenience, here are the offsets and positions of the
+sample text.
+
+
++--------+-----------------------------------+
+| | 1111111111222222222233333|
+| offset|01234567890123456789012345678901234|
++--------+-----------------------------------+
+|document|Lucene is a search engine library. |
++--------*-----------------------------------+
+|position|0 1 2 3 4 5 |
++--------*-----------------------------------+
+
+
+Step 1.
+In Step 1, Fast Vector Highlighter generates {@link org.apache.lucene.search.highlight.termvector.FieldQuery.QueryPhraseMap} from the user query.
+QueryPhraseMap consists of the following members:
+
+public class QueryPhraseMap {
+ boolean terminal;
+ int slop; // valid if terminal == true and phraseHighlight == true
+ float boost; // valid if terminal == true
+ Map<String, QueryPhraseMap> subMap;
+}
+
+QueryPhraseMap has subMap. The key of the subMap is a term
+text in the user query and the value is a subsequent QueryPhraseMap.
+If the query is a term (not phrase), then the subsequent QueryPhraseMap
+is marked as terminal. If the query is a phrase, then the subsequent QueryPhraseMap
+is not a terminal and it has the next term text in the phrase.
+
+From the sample user query, the following QueryPhraseMap
+will be generated:
+
+ QueryPhraseMap
++--------+-+ +-------+-+
+|"Lucene"|o+->|boost=2|*| * : terminal
++--------+-+ +-------+-+
+
++--------+-+ +---------+-+ +-------+------+-+
+|"search"|o+->|"library"|o+->|boost=1|slop=1|*|
++--------+-+ +---------+-+ +-------+------+-+
+
+
+Step 2.
+In Step 2, Fast Vector Highlighter generates {@link org.apache.lucene.search.highlight.termvector.FieldTermStack}. Fast Vector Highlighter uses {@link org.apache.lucene.index.TermFreqVector} data
+(must be stored {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS_OFFSETS})
+to generate it. FieldTermStack keeps the terms in the user query.
+Therefore, in this sample case, Fast Vector Highlighter generates the following FieldTermStack:
+
+ FieldTermStack
++------------------+
+|"Lucene"(0,6,0) |
++------------------+
+|"search"(12,18,3) |
++------------------+
+|"library"(26,33,5)|
++------------------+
+where : "termText"(startOffset,endOffset,position)
+
+Step 3.
+In Step 3, Fast Vector Highlighter generates {@link org.apache.lucene.search.highlight.termvector.FieldPhraseList}
+by reference to QueryPhraseMap and FieldTermStack.
+
+ FieldPhraseList
++----------------+-----------------+---+
+|"Lucene" |[(0,6)] |w=2|
++----------------+-----------------+---+
+|"search library"|[(12,18),(26,33)]|w=1|
++----------------+-----------------+---+
+
+The type of each entry is WeightedPhraseInfo that consists of
+an array of terms offsets and weight. The weight (Fast Vector Highlighter uses query boost to
+calculate the weight) will be taken into account when Fast Vector Highlighter creates
+{@link org.apache.lucene.search.highlight.termvector.FieldFragList} in the next step.
+Step 4.
+In Step 4, Fast Vector Highlighter creates FieldFragList by reference to
+FieldPhraseList. In this sample case, the following
+FieldFragList will be generated:
+
+ FieldFragList
++---------------------------------+
+|"Lucene"[(0,6)] |
+|"search library"[(12,18),(26,33)]|
+|totalBoost=3 |
++---------------------------------+
+
+Step 5.
+In Step 5, by using FieldFragList and the field stored data,
+Fast Vector Highlighter creates highlighted snippets!
+
+
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanGradientFormatter.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanGradientFormatter.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanGradientFormatter.java (working copy)
@@ -1,78 +0,0 @@
-package org.apache.lucene.search.highlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Formats text with different color intensity depending on the score of the
- * term using the span tag. GradientFormatter uses a bgcolor argument to the font tag which
- * doesn't work in Mozilla, thus this class.
- *
- * @see GradientFormatter
- *
- */
-
-public class SpanGradientFormatter
- extends GradientFormatter
-{
- public SpanGradientFormatter(float maxScore, String minForegroundColor,
- String maxForegroundColor, String minBackgroundColor,
- String maxBackgroundColor)
- {
- super( maxScore, minForegroundColor,
- maxForegroundColor, minBackgroundColor,
- maxBackgroundColor);
- }
-
-
-
- @Override
- public String highlightTerm(String originalText, TokenGroup tokenGroup)
- {
- if (tokenGroup.getTotalScore() == 0)
- return originalText;
- float score = tokenGroup.getTotalScore();
- if (score == 0)
- {
- return originalText;
- }
-
- // try to size sb correctly
- StringBuilder sb = new StringBuilder( originalText.length() + EXTRA);
-
- sb.append("");
- sb.append(originalText);
- sb.append("");
- return sb.toString();
- }
-
- // guess how much extra text we'll add to the text we're highlighting to try to avoid a StringBuilder resize
- private static final String TEMPLATE = "...";
- private static final int EXTRA = TEMPLATE.length();
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (working copy)
@@ -1,159 +0,0 @@
-package org.apache.lucene.search.vectorhighlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Collections;
-import java.util.LinkedList;
-import java.util.Set;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.TermFreqVector;
-import org.apache.lucene.index.TermPositionVector;
-import org.apache.lucene.index.TermVectorOffsetInfo;
-
-/**
- * FieldTermStack is a stack that keeps query terms in the specified field
- * of the document to be highlighted.
- */
-public class FieldTermStack {
-
- private final String fieldName;
- LinkedList termList = new LinkedList();
-
- //public static void main( String[] args ) throws Exception {
- // Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT);
- // QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "f", analyzer );
- // Query query = parser.parse( "a x:b" );
- // FieldQuery fieldQuery = new FieldQuery( query, true, false );
-
- // Directory dir = new RAMDirectory();
- // IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer));
- // Document doc = new Document();
- // doc.add( new Field( "f", "a a a b b c a b b c d e f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
- // doc.add( new Field( "f", "b a b a f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) );
- // writer.addDocument( doc );
- // writer.close();
-
- // IndexReader reader = IndexReader.open( dir, true );
- // new FieldTermStack( reader, 0, "f", fieldQuery );
- // reader.close();
- //}
-
- /**
- * a constructor.
- *
- * @param reader IndexReader of the index
- * @param docId document id to be highlighted
- * @param fieldName field of the document to be highlighted
- * @param fieldQuery FieldQuery object
- * @throws IOException
- */
- public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException {
- this.fieldName = fieldName;
-
- TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName );
- if( tfv == null ) return; // just return to make null snippets
- TermPositionVector tpv = null;
- try{
- tpv = (TermPositionVector)tfv;
- }
- catch( ClassCastException e ){
- return; // just return to make null snippets
- }
-
- Set termSet = fieldQuery.getTermSet( fieldName );
- // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
- if( termSet == null ) return;
-
- for( String term : tpv.getTerms() ){
- if( !termSet.contains( term ) ) continue;
- int index = tpv.indexOf( term );
- TermVectorOffsetInfo[] tvois = tpv.getOffsets( index );
- if( tvois == null ) return; // just return to make null snippets
- int[] poss = tpv.getTermPositions( index );
- if( poss == null ) return; // just return to make null snippets
- for( int i = 0; i < tvois.length; i++ )
- termList.add( new TermInfo( term, tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) );
- }
-
- // sort by position
- Collections.sort( termList );
- }
-
- /**
- * @return field name
- */
- public String getFieldName(){
- return fieldName;
- }
-
- /**
- * @return the top TermInfo object of the stack
- */
- public TermInfo pop(){
- return termList.poll();
- }
-
- /**
- * @param termInfo the TermInfo object to be put on the top of the stack
- */
- public void push( TermInfo termInfo ){
- // termList.push( termInfo ); // avoid Java 1.6 feature
- termList.addFirst( termInfo );
- }
-
- /**
- * to know whether the stack is empty
- *
- * @return true if the stack is empty, false if not
- */
- public boolean isEmpty(){
- return termList == null || termList.size() == 0;
- }
-
- public static class TermInfo implements Comparable{
-
- final String text;
- final int startOffset;
- final int endOffset;
- final int position;
-
- TermInfo( String text, int startOffset, int endOffset, int position ){
- this.text = text;
- this.startOffset = startOffset;
- this.endOffset = endOffset;
- this.position = position;
- }
-
- public String getText(){ return text; }
- public int getStartOffset(){ return startOffset; }
- public int getEndOffset(){ return endOffset; }
- public int getPosition(){ return position; }
-
- @Override
- public String toString(){
- StringBuilder sb = new StringBuilder();
- sb.append( text ).append( '(' ).append(startOffset).append( ',' ).append( endOffset ).append( ',' ).append( position ).append( ')' );
- return sb.toString();
- }
-
- public int compareTo( TermInfo o ) {
- return ( this.position - o.position );
- }
- }
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (working copy)
@@ -1,399 +0,0 @@
-package org.apache.lucene.search.vectorhighlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.DisjunctionMaxQuery;
-import org.apache.lucene.search.PhraseQuery;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
-
-/**
- * FieldQuery breaks down query object into terms/phrases and keep
- * them in QueryPhraseMap structure.
- */
-public class FieldQuery {
-
- final boolean fieldMatch;
-
- // fieldMatch==true, Map
- // fieldMatch==false, Map
- Map rootMaps = new HashMap();
-
- // fieldMatch==true, Map
- // fieldMatch==false, Map
- Map> termSetMap = new HashMap>();
-
- int termOrPhraseNumber; // used for colored tag support
-
- FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ){
- this.fieldMatch = fieldMatch;
- Set flatQueries = new HashSet();
- flatten( query, flatQueries );
- saveTerms( flatQueries );
- Collection expandQueries = expand( flatQueries );
-
- for( Query flatQuery : expandQueries ){
- QueryPhraseMap rootMap = getRootMap( flatQuery );
- rootMap.add( flatQuery );
- if( !phraseHighlight && flatQuery instanceof PhraseQuery ){
- PhraseQuery pq = (PhraseQuery)flatQuery;
- if( pq.getTerms().length > 1 ){
- for( Term term : pq.getTerms() )
- rootMap.addTerm( term, flatQuery.getBoost() );
- }
- }
- }
- }
-
- void flatten( Query sourceQuery, Collection flatQueries ){
- if( sourceQuery instanceof BooleanQuery ){
- BooleanQuery bq = (BooleanQuery)sourceQuery;
- for( BooleanClause clause : bq.getClauses() ){
- if( !clause.isProhibited() )
- flatten( clause.getQuery(), flatQueries );
- }
- }
- else if( sourceQuery instanceof DisjunctionMaxQuery ){
- DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
- for( Query query : dmq ){
- flatten( query, flatQueries );
- }
- }
- else if( sourceQuery instanceof TermQuery ){
- if( !flatQueries.contains( sourceQuery ) )
- flatQueries.add( sourceQuery );
- }
- else if( sourceQuery instanceof PhraseQuery ){
- if( !flatQueries.contains( sourceQuery ) ){
- PhraseQuery pq = (PhraseQuery)sourceQuery;
- if( pq.getTerms().length > 1 )
- flatQueries.add( pq );
- else if( pq.getTerms().length == 1 ){
- flatQueries.add( new TermQuery( pq.getTerms()[0] ) );
- }
- }
- }
- // else discard queries
- }
-
- /*
- * Create expandQueries from flatQueries.
- *
- * expandQueries := flatQueries + overlapped phrase queries
- *
- * ex1) flatQueries={a,b,c}
- * => expandQueries={a,b,c}
- * ex2) flatQueries={a,"b c","c d"}
- * => expandQueries={a,"b c","c d","b c d"}
- */
- Collection expand( Collection flatQueries ){
- Set expandQueries = new HashSet();
- for( Iterator i = flatQueries.iterator(); i.hasNext(); ){
- Query query = i.next();
- i.remove();
- expandQueries.add( query );
- if( !( query instanceof PhraseQuery ) ) continue;
- for( Iterator j = flatQueries.iterator(); j.hasNext(); ){
- Query qj = j.next();
- if( !( qj instanceof PhraseQuery ) ) continue;
- checkOverlap( expandQueries, (PhraseQuery)query, (PhraseQuery)qj );
- }
- }
- return expandQueries;
- }
-
- /*
- * Check if PhraseQuery A and B have overlapped part.
- *
- * ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"}
- * ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"}
- * ex3) A="a b", B="c d" => no overlap; expandQueries={}
- */
- private void checkOverlap( Collection expandQueries, PhraseQuery a, PhraseQuery b ){
- if( a.getSlop() != b.getSlop() ) return;
- Term[] ats = a.getTerms();
- Term[] bts = b.getTerms();
- if( fieldMatch && !ats[0].field().equals( bts[0].field() ) ) return;
- checkOverlap( expandQueries, ats, bts, a.getSlop(), a.getBoost() );
- checkOverlap( expandQueries, bts, ats, b.getSlop(), b.getBoost() );
- }
-
- /*
- * Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries.
- *
- * ex1) src="a b", dest="c d" => no overlap
- * ex2) src="a b", dest="a b c" => no overlap
- * ex3) src="a b", dest="b c" => overlap; expandQueries={"a b c"}
- * ex4) src="a b c", dest="b c d" => overlap; expandQueries={"a b c d"}
- * ex5) src="a b c", dest="b c" => no overlap
- * ex6) src="a b c", dest="b" => no overlap
- * ex7) src="a a a a", dest="a a a" => overlap;
- * expandQueries={"a a a a a","a a a a a a"}
- * ex8) src="a b c d", dest="b c" => no overlap
- */
- private void checkOverlap( Collection expandQueries, Term[] src, Term[] dest, int slop, float boost ){
- // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms
- // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten()
- // converts PhraseQuery to TermQuery)
- for( int i = 1; i < src.length; i++ ){
- boolean overlap = true;
- for( int j = i; j < src.length; j++ ){
- if( ( j - i ) < dest.length && !src[j].text().equals( dest[j-i].text() ) ){
- overlap = false;
- break;
- }
- }
- if( overlap && src.length - i < dest.length ){
- PhraseQuery pq = new PhraseQuery();
- for( Term srcTerm : src )
- pq.add( srcTerm );
- for( int k = src.length - i; k < dest.length; k++ ){
- pq.add( new Term( src[0].field(), dest[k].text() ) );
- }
- pq.setSlop( slop );
- pq.setBoost( boost );
- if(!expandQueries.contains( pq ) )
- expandQueries.add( pq );
- }
- }
- }
-
- QueryPhraseMap getRootMap( Query query ){
- String key = getKey( query );
- QueryPhraseMap map = rootMaps.get( key );
- if( map == null ){
- map = new QueryPhraseMap( this );
- rootMaps.put( key, map );
- }
- return map;
- }
-
- /*
- * Return 'key' string. 'key' is the field name of the Query.
- * If not fieldMatch, 'key' will be null.
- */
- private String getKey( Query query ){
- if( !fieldMatch ) return null;
- if( query instanceof TermQuery )
- return ((TermQuery)query).getTerm().field();
- else if ( query instanceof PhraseQuery ){
- PhraseQuery pq = (PhraseQuery)query;
- Term[] terms = pq.getTerms();
- return terms[0].field();
- }
- else
- throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
- }
-
- /*
- * Save the set of terms in the queries to termSetMap.
- *
- * ex1) q=name:john
- * - fieldMatch==true
- * termSetMap=Map<"name",Set<"john">>
- * - fieldMatch==false
- * termSetMap=Map>
- *
- * ex2) q=name:john title:manager
- * - fieldMatch==true
- * termSetMap=Map<"name",Set<"john">,
- * "title",Set<"manager">>
- * - fieldMatch==false
- * termSetMap=Map>
- *
- * ex3) q=name:"john lennon"
- * - fieldMatch==true
- * termSetMap=Map<"name",Set<"john","lennon">>
- * - fieldMatch==false
- * termSetMap=Map>
- */
- void saveTerms( Collection flatQueries ){
- for( Query query : flatQueries ){
- Set termSet = getTermSet( query );
- if( query instanceof TermQuery )
- termSet.add( ((TermQuery)query).getTerm().text() );
- else if( query instanceof PhraseQuery ){
- for( Term term : ((PhraseQuery)query).getTerms() )
- termSet.add( term.text() );
- }
- else
- throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
- }
- }
-
- private Set getTermSet( Query query ){
- String key = getKey( query );
- Set set = termSetMap.get( key );
- if( set == null ){
- set = new HashSet();
- termSetMap.put( key, set );
- }
- return set;
- }
-
- Set getTermSet( String field ){
- return termSetMap.get( fieldMatch ? field : null );
- }
-
- /**
- *
- * @param fieldName
- * @param term
- * @return QueryPhraseMap
- */
- public QueryPhraseMap getFieldTermMap( String fieldName, String term ){
- QueryPhraseMap rootMap = getRootMap( fieldName );
- return rootMap == null ? null : rootMap.subMap.get( term );
- }
-
- /**
- *
- * @param fieldName
- * @param phraseCandidate
- * @return QueryPhraseMap
- */
- public QueryPhraseMap searchPhrase( String fieldName, final List phraseCandidate ){
- QueryPhraseMap root = getRootMap( fieldName );
- if( root == null ) return null;
- return root.searchPhrase( phraseCandidate );
- }
-
- private QueryPhraseMap getRootMap( String fieldName ){
- return rootMaps.get( fieldMatch ? fieldName : null );
- }
-
- int nextTermOrPhraseNumber(){
- return termOrPhraseNumber++;
- }
-
- public static class QueryPhraseMap {
-
- boolean terminal;
- int slop; // valid if terminal == true and phraseHighlight == true
- float boost; // valid if terminal == true
- int termOrPhraseNumber; // valid if terminal == true
- FieldQuery fieldQuery;
- Map subMap = new HashMap();
-
- public QueryPhraseMap( FieldQuery fieldQuery ){
- this.fieldQuery = fieldQuery;
- }
-
- void addTerm( Term term, float boost ){
- QueryPhraseMap map = getOrNewMap( subMap, term.text() );
- map.markTerminal( boost );
- }
-
- private QueryPhraseMap getOrNewMap( Map subMap, String term ){
- QueryPhraseMap map = subMap.get( term );
- if( map == null ){
- map = new QueryPhraseMap( fieldQuery );
- subMap.put( term, map );
- }
- return map;
- }
-
- void add( Query query ){
- if( query instanceof TermQuery ){
- addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
- }
- else if( query instanceof PhraseQuery ){
- PhraseQuery pq = (PhraseQuery)query;
- Term[] terms = pq.getTerms();
- Map map = subMap;
- QueryPhraseMap qpm = null;
- for( Term term : terms ){
- qpm = getOrNewMap( map, term.text() );
- map = qpm.subMap;
- }
- qpm.markTerminal( pq.getSlop(), pq.getBoost() );
- }
- else
- throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
- }
-
- public QueryPhraseMap getTermMap( String term ){
- return subMap.get( term );
- }
-
- private void markTerminal( float boost ){
- markTerminal( 0, boost );
- }
-
- private void markTerminal( int slop, float boost ){
- this.terminal = true;
- this.slop = slop;
- this.boost = boost;
- this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
- }
-
- public boolean isTerminal(){
- return terminal;
- }
-
- public int getSlop(){
- return slop;
- }
-
- public float getBoost(){
- return boost;
- }
-
- public int getTermOrPhraseNumber(){
- return termOrPhraseNumber;
- }
-
- public QueryPhraseMap searchPhrase( final List phraseCandidate ){
- QueryPhraseMap currMap = this;
- for( TermInfo ti : phraseCandidate ){
- currMap = currMap.subMap.get( ti.getText() );
- if( currMap == null ) return null;
- }
- return currMap.isValidTermOrPhrase( phraseCandidate ) ? currMap : null;
- }
-
- public boolean isValidTermOrPhrase( final List phraseCandidate ){
- // check terminal
- if( !terminal ) return false;
-
- // if the candidate is a term, it is valid
- if( phraseCandidate.size() == 1 ) return true;
-
- // else check whether the candidate is valid phrase
- // compare position-gaps between terms to slop
- int pos = phraseCandidate.get( 0 ).getPosition();
- for( int i = 1; i < phraseCandidate.size(); i++ ){
- int nextPos = phraseCandidate.get( i ).getPosition();
- if( Math.abs( nextPos - pos - 1 ) > slop ) return false;
- pos = nextPos;
- }
- return true;
- }
- }
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (working copy)
@@ -1,154 +0,0 @@
-package org.apache.lucene.search.vectorhighlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.MapFieldSelector;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
-import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
-import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
-
-public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
-
- protected String[] preTags, postTags;
- public static final String[] COLORED_PRE_TAGS = {
- "", "", "",
- "", "", "",
- "", "", "",
- ""
- };
- public static final String[] COLORED_POST_TAGS = { "" };
-
- protected BaseFragmentsBuilder(){
- this( new String[]{ "" }, new String[]{ "" } );
- }
-
- protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){
- this.preTags = preTags;
- this.postTags = postTags;
- }
-
- static Object checkTagsArgument( Object tags ){
- if( tags instanceof String ) return tags;
- else if( tags instanceof String[] ) return tags;
- throw new IllegalArgumentException( "type of preTags/postTags must be a String or String[]" );
- }
-
- public abstract List getWeightedFragInfoList( List src );
-
- public String createFragment( IndexReader reader, int docId,
- String fieldName, FieldFragList fieldFragList ) throws IOException {
- String[] fragments = createFragments( reader, docId, fieldName, fieldFragList, 1 );
- if( fragments == null || fragments.length == 0 ) return null;
- return fragments[0];
- }
-
- public String[] createFragments( IndexReader reader, int docId,
- String fieldName, FieldFragList fieldFragList, int maxNumFragments )
- throws IOException {
- if( maxNumFragments < 0 )
- throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
-
- List fragInfos = getWeightedFragInfoList( fieldFragList.fragInfos );
-
- List fragments = new ArrayList( maxNumFragments );
- Field[] values = getFields( reader, docId, fieldName );
- if( values.length == 0 ) return null;
- StringBuilder buffer = new StringBuilder();
- int[] nextValueIndex = { 0 };
- for( int n = 0; n < maxNumFragments && n < fragInfos.size(); n++ ){
- WeightedFragInfo fragInfo = fragInfos.get( n );
- fragments.add( makeFragment( buffer, nextValueIndex, values, fragInfo ) );
- }
- return fragments.toArray( new String[fragments.size()] );
- }
-
- @Deprecated
- protected String[] getFieldValues( IndexReader reader, int docId, String fieldName) throws IOException {
- Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) );
- return doc.getValues( fieldName ); // according to Document class javadoc, this never returns null
- }
-
- protected Field[] getFields( IndexReader reader, int docId, String fieldName) throws IOException {
- // according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field???
- Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) );
- return doc.getFields( fieldName ); // according to Document class javadoc, this never returns null
- }
-
- @Deprecated
- protected String makeFragment( StringBuilder buffer, int[] index, String[] values, WeightedFragInfo fragInfo ){
- final int s = fragInfo.startOffset;
- return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s );
- }
-
- protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo ){
- final int s = fragInfo.startOffset;
- return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s );
- }
-
- private String makeFragment( WeightedFragInfo fragInfo, String src, int s ){
- StringBuilder fragment = new StringBuilder();
- int srcIndex = 0;
- for( SubInfo subInfo : fragInfo.subInfos ){
- for( Toffs to : subInfo.termsOffsets ){
- fragment.append( src.substring( srcIndex, to.startOffset - s ) ).append( getPreTag( subInfo.seqnum ) )
- .append( src.substring( to.startOffset - s, to.endOffset - s ) ).append( getPostTag( subInfo.seqnum ) );
- srcIndex = to.endOffset - s;
- }
- }
- fragment.append( src.substring( srcIndex ) );
- return fragment.toString();
- }
-
- @Deprecated
- protected String getFragmentSource( StringBuilder buffer, int[] index, String[] values,
- int startOffset, int endOffset ){
- while( buffer.length() < endOffset && index[0] < values.length ){
- if( index[0] > 0 && values[index[0]].length() > 0 )
- buffer.append( ' ' );
- buffer.append( values[index[0]++] );
- }
- int eo = buffer.length() < endOffset ? buffer.length() : endOffset;
- return buffer.substring( startOffset, eo );
- }
-
- protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values,
- int startOffset, int endOffset ){
- while( buffer.length() < endOffset && index[0] < values.length ){
- if( index[0] > 0 && values[index[0]].isTokenized() && values[index[0]].stringValue().length() > 0 )
- buffer.append( ' ' );
- buffer.append( values[index[0]++].stringValue() );
- }
- int eo = buffer.length() < endOffset ? buffer.length() : endOffset;
- return buffer.substring( startOffset, eo );
- }
-
- protected String getPreTag( int num ){
- return preTags.length > num ? preTags[num] : preTags[0];
- }
-
- protected String getPostTag( int num ){
- return postTags.length > num ? postTags[num] : postTags[0];
- }
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java (working copy)
@@ -1,70 +0,0 @@
-package org.apache.lucene.search.vectorhighlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-
-import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
-
-/**
- * An implementation of FragmentsBuilder that outputs score-order fragments.
- */
-public class ScoreOrderFragmentsBuilder extends BaseFragmentsBuilder {
-
- /**
- * a constructor.
- */
- public ScoreOrderFragmentsBuilder(){
- super();
- }
-
- /**
- * a constructor.
- *
- * @param preTags array of pre-tags for markup terms.
- * @param postTags array of post-tags for markup terms.
- */
- public ScoreOrderFragmentsBuilder( String[] preTags, String[] postTags ){
- super( preTags, postTags );
- }
-
- /**
- * Sort by score the list of WeightedFragInfo
- */
- @Override
- public List getWeightedFragInfoList( List src ) {
- Collections.sort( src, new ScoreComparator() );
- return src;
- }
-
- public static class ScoreComparator implements Comparator {
-
- public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) {
- if( o1.totalBoost > o2.totalBoost ) return -1;
- else if( o1.totalBoost < o2.totalBoost ) return 1;
- // if same score then check startOffset
- else{
- if( o1.startOffset < o2.startOffset ) return -1;
- else if( o1.startOffset > o2.startOffset ) return 1;
- }
- return 0;
- }
- }
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html (working copy)
@@ -1,143 +0,0 @@
-
-
-
-
-This is an another highlighter implementation.
-
-Features
-
-- fast for large docs
-- support N-gram fields
-- support phrase-unit highlighting with slops
-- need Java 1.5
-- highlight fields need to be TermVector.WITH_POSITIONS_OFFSETS
-- take into account query boost to score fragments
-- support colored highlight tags
-- pluggable FragListBuilder
-- pluggable FragmentsBuilder
-
-
-Algorithm
-To explain the algorithm, let's use the following sample text
- (to be highlighted) and user query:
-
-
-
-| Sample Text |
-Lucene is a search engine library. |
-
-
-| User Query |
-Lucene^2 OR "search library"~1 |
-
-
-
-The user query is a BooleanQuery that consists of TermQuery("Lucene")
-with boost of 2 and PhraseQuery("search library") with slop of 1.
-For your convenience, here is the offsets and positions info of the
-sample text.
-
-
-+--------+-----------------------------------+
-| | 1111111111222222222233333|
-| offset|01234567890123456789012345678901234|
-+--------+-----------------------------------+
-|document|Lucene is a search engine library. |
-+--------*-----------------------------------+
-|position|0 1 2 3 4 5 |
-+--------*-----------------------------------+
-
-
-Step 1.
-In Step 1, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap} from the user query.
-QueryPhraseMap consists of the following members:
-
-public class QueryPhraseMap {
- boolean terminal;
- int slop; // valid if terminal == true and phraseHighlight == true
- float boost; // valid if terminal == true
- Map<String, QueryPhraseMap> subMap;
-}
-
-QueryPhraseMap has subMap. The key of the subMap is a term
-text in the user query and the value is a subsequent QueryPhraseMap.
-If the query is a term (not phrase), then the subsequent QueryPhraseMap
-is marked as terminal. If the query is a phrase, then the subsequent QueryPhraseMap
-is not a terminal and it has the next term text in the phrase.
-
-From the sample user query, the following QueryPhraseMap
-will be generated:
-
- QueryPhraseMap
-+--------+-+ +-------+-+
-|"Lucene"|o+->|boost=2|*| * : terminal
-+--------+-+ +-------+-+
-
-+--------+-+ +---------+-+ +-------+------+-+
-|"search"|o+->|"library"|o+->|boost=1|slop=1|*|
-+--------+-+ +---------+-+ +-------+------+-+
-
-
-Step 2.
-In Step 2, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldTermStack}. Fast Vector Highlighter uses {@link org.apache.lucene.index.TermFreqVector} data
-(must be stored {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS_OFFSETS})
-to generate it. FieldTermStack keeps the terms in the user query.
-Therefore, in this sample case, Fast Vector Highlighter generates the following FieldTermStack:
-
- FieldTermStack
-+------------------+
-|"Lucene"(0,6,0) |
-+------------------+
-|"search"(12,18,3) |
-+------------------+
-|"library"(26,33,5)|
-+------------------+
-where : "termText"(startOffset,endOffset,position)
-
-Step 3.
-In Step 3, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldPhraseList}
-by reference to QueryPhraseMap and FieldTermStack.
-
- FieldPhraseList
-+----------------+-----------------+---+
-|"Lucene" |[(0,6)] |w=2|
-+----------------+-----------------+---+
-|"search library"|[(12,18),(26,33)]|w=1|
-+----------------+-----------------+---+
-
-The type of each entry is WeightedPhraseInfo that consists of
-an array of terms offsets and weight. The weight (Fast Vector Highlighter uses query boost to
-calculate the weight) will be taken into account when Fast Vector Highlighter creates
-{@link org.apache.lucene.search.vectorhighlight.FieldFragList} in the next step.
-Step 4.
-In Step 4, Fast Vector Highlighter creates FieldFragList by reference to
-FieldPhraseList. In this sample case, the following
-FieldFragList will be generated:
-
- FieldFragList
-+---------------------------------+
-|"Lucene"[(0,6)] |
-|"search library"[(12,18),(26,33)]|
-|totalBoost=3 |
-+---------------------------------+
-
-Step 5.
-In Step 5, by using FieldFragList and the field stored data,
-Fast Vector Highlighter creates highlighted snippets!
-
-
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java (working copy)
@@ -1,54 +0,0 @@
-package org.apache.lucene.search.vectorhighlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.List;
-
-import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
-
-/**
- * A simple implementation of FragmentsBuilder.
- *
- */
-public class SimpleFragmentsBuilder extends BaseFragmentsBuilder {
-
- /**
- * a constructor.
- */
- public SimpleFragmentsBuilder() {
- super();
- }
-
- /**
- * a constructor.
- *
- * @param preTags array of pre-tags for markup terms.
- * @param postTags array of post-tags for markup terms.
- */
- public SimpleFragmentsBuilder( String[] preTags, String[] postTags ) {
- super( preTags, postTags );
- }
-
- /**
- * do nothing. return the source list.
- */
- @Override
- public List getWeightedFragInfoList( List src ) {
- return src;
- }
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java (working copy)
@@ -1,137 +0,0 @@
-package org.apache.lucene.search.vectorhighlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.Query;
-
-/**
- * Another highlighter implementation.
- *
- */
-public class FastVectorHighlighter {
-
- public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true;
- public static final boolean DEFAULT_FIELD_MATCH = true;
- private final boolean phraseHighlight;
- private final boolean fieldMatch;
- private final FragListBuilder fragListBuilder;
- private final FragmentsBuilder fragmentsBuilder;
-
- /**
- * the default constructor.
- */
- public FastVectorHighlighter(){
- this( DEFAULT_PHRASE_HIGHLIGHT, DEFAULT_FIELD_MATCH );
- }
-
- /**
- * a constructor. Using SimpleFragListBuilder and ScoreOrderFragmentsBuilder.
- *
- * @param phraseHighlight true or false for phrase highlighting
- * @param fieldMatch true of false for field matching
- */
- public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch ){
- this( phraseHighlight, fieldMatch, new SimpleFragListBuilder(), new ScoreOrderFragmentsBuilder() );
- }
-
- /**
- * a constructor. A FragListBuilder and a FragmentsBuilder can be specified (plugins).
- *
- * @param phraseHighlight true of false for phrase highlighting
- * @param fieldMatch true of false for field matching
- * @param fragListBuilder an instance of FragListBuilder
- * @param fragmentsBuilder an instance of FragmentsBuilder
- */
- public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch,
- FragListBuilder fragListBuilder, FragmentsBuilder fragmentsBuilder ){
- this.phraseHighlight = phraseHighlight;
- this.fieldMatch = fieldMatch;
- this.fragListBuilder = fragListBuilder;
- this.fragmentsBuilder = fragmentsBuilder;
- }
-
- /**
- * create a FieldQuery object.
- *
- * @param query a query
- * @return the created FieldQuery object
- */
- public FieldQuery getFieldQuery( Query query ){
- return new FieldQuery( query, phraseHighlight, fieldMatch );
- }
-
- /**
- * return the best fragment.
- *
- * @param fieldQuery FieldQuery object
- * @param reader IndexReader of the index
- * @param docId document id to be highlighted
- * @param fieldName field of the document to be highlighted
- * @param fragCharSize the length (number of chars) of a fragment
- * @return the best fragment (snippet) string
- * @throws IOException
- */
- public final String getBestFragment( final FieldQuery fieldQuery, IndexReader reader, int docId,
- String fieldName, int fragCharSize ) throws IOException {
- FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize );
- return fragmentsBuilder.createFragment( reader, docId, fieldName, fieldFragList );
- }
-
- /**
- * return the best fragments.
- *
- * @param fieldQuery FieldQuery object
- * @param reader IndexReader of the index
- * @param docId document id to be highlighted
- * @param fieldName field of the document to be highlighted
- * @param fragCharSize the length (number of chars) of a fragment
- * @param maxNumFragments maximum number of fragments
- * @return created fragments or null when no fragments created.
- * size of the array can be less than maxNumFragments
- * @throws IOException
- */
- public final String[] getBestFragments( final FieldQuery fieldQuery, IndexReader reader, int docId,
- String fieldName, int fragCharSize, int maxNumFragments ) throws IOException {
- FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize );
- return fragmentsBuilder.createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments );
- }
-
- private FieldFragList getFieldFragList( final FieldQuery fieldQuery, IndexReader reader, int docId,
- String fieldName, int fragCharSize ) throws IOException {
- FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, fieldName, fieldQuery );
- FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery );
- return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize );
- }
-
- /**
- * return whether phraseHighlight or not.
- *
- * @return whether phraseHighlight or not
- */
- public boolean isPhraseHighlight(){ return phraseHighlight; }
-
- /**
- * return whether fieldMatch or not.
- *
- * @return whether fieldMatch or not
- */
- public boolean isFieldMatch(){ return fieldMatch; }
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java (working copy)
@@ -1,34 +0,0 @@
-package org.apache.lucene.search.vectorhighlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * FragListBuilder is an interface for FieldFragList builder classes.
- * A FragListBuilder class can be plugged in to Highlighter.
- */
-public interface FragListBuilder {
-
- /**
- * create a FieldFragList.
- *
- * @param fieldPhraseList FieldPhraseList object
- * @param fragCharSize the length (number of chars) of a fragment
- * @return the created FieldFragList object
- */
- public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize );
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (working copy)
@@ -1,191 +0,0 @@
-package org.apache.lucene.search.vectorhighlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.ArrayList;
-import java.util.LinkedList;
-import java.util.List;
-
-import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
-import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
-
-/**
- * FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder
- * to create a FieldFragList object.
- */
-public class FieldPhraseList {
-
- LinkedList phraseList = new LinkedList();
-
- /**
- * a constructor.
- *
- * @param fieldTermStack FieldTermStack object
- * @param fieldQuery FieldQuery object
- */
- public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery ){
- final String field = fieldTermStack.getFieldName();
-
- LinkedList phraseCandidate = new LinkedList();
- QueryPhraseMap currMap = null;
- QueryPhraseMap nextMap = null;
- while( !fieldTermStack.isEmpty() ){
-
- phraseCandidate.clear();
-
- TermInfo ti = fieldTermStack.pop();
- currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
-
- // if not found, discard top TermInfo from stack, then try next element
- if( currMap == null ) continue;
-
- // if found, search the longest phrase
- phraseCandidate.add( ti );
- while( true ){
- ti = fieldTermStack.pop();
- nextMap = null;
- if( ti != null )
- nextMap = currMap.getTermMap( ti.getText() );
- if( ti == null || nextMap == null ){
- if( ti != null )
- fieldTermStack.push( ti );
- if( currMap.isValidTermOrPhrase( phraseCandidate ) ){
- addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
- }
- else{
- while( phraseCandidate.size() > 1 ){
- fieldTermStack.push( phraseCandidate.removeLast() );
- currMap = fieldQuery.searchPhrase( field, phraseCandidate );
- if( currMap != null ){
- addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
- break;
- }
- }
- }
- break;
- }
- else{
- phraseCandidate.add( ti );
- currMap = nextMap;
- }
- }
- }
- }
-
- void addIfNoOverlap( WeightedPhraseInfo wpi ){
- for( WeightedPhraseInfo existWpi : phraseList ){
- if( existWpi.isOffsetOverlap( wpi ) ) return;
- }
- phraseList.add( wpi );
- }
-
- public static class WeightedPhraseInfo {
-
- String text; // unnecessary member, just exists for debugging purpose
- List termsOffsets; // usually termsOffsets.size() == 1,
- // but if position-gap > 1 and slop > 0 then size() could be greater than 1
- float boost; // query boost
- int seqnum;
-
- public WeightedPhraseInfo( LinkedList terms, float boost ){
- this( terms, boost, 0 );
- }
-
- public WeightedPhraseInfo( LinkedList terms, float boost, int number ){
- this.boost = boost;
- this.seqnum = number;
- termsOffsets = new ArrayList( terms.size() );
- TermInfo ti = terms.get( 0 );
- termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
- if( terms.size() == 1 ){
- text = ti.getText();
- return;
- }
- StringBuilder sb = new StringBuilder();
- sb.append( ti.getText() );
- int pos = ti.getPosition();
- for( int i = 1; i < terms.size(); i++ ){
- ti = terms.get( i );
- sb.append( ti.getText() );
- if( ti.getPosition() - pos == 1 ){
- Toffs to = termsOffsets.get( termsOffsets.size() - 1 );
- to.setEndOffset( ti.getEndOffset() );
- }
- else{
- termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
- }
- pos = ti.getPosition();
- }
- text = sb.toString();
- }
-
- public int getStartOffset(){
- return termsOffsets.get( 0 ).startOffset;
- }
-
- public int getEndOffset(){
- return termsOffsets.get( termsOffsets.size() - 1 ).endOffset;
- }
-
- public boolean isOffsetOverlap( WeightedPhraseInfo other ){
- int so = getStartOffset();
- int eo = getEndOffset();
- int oso = other.getStartOffset();
- int oeo = other.getEndOffset();
- if( so <= oso && oso < eo ) return true;
- if( so < oeo && oeo <= eo ) return true;
- if( oso <= so && so < oeo ) return true;
- if( oso < eo && eo <= oeo ) return true;
- return false;
- }
-
- @Override
- public String toString(){
- StringBuilder sb = new StringBuilder();
- sb.append( text ).append( '(' ).append( boost ).append( ")(" );
- for( Toffs to : termsOffsets ){
- sb.append( to );
- }
- sb.append( ')' );
- return sb.toString();
- }
-
- public static class Toffs {
- int startOffset;
- int endOffset;
- public Toffs( int startOffset, int endOffset ){
- this.startOffset = startOffset;
- this.endOffset = endOffset;
- }
- public void setEndOffset( int endOffset ){
- this.endOffset = endOffset;
- }
- public int getStartOffset(){
- return startOffset;
- }
- public int getEndOffset(){
- return endOffset;
- }
- @Override
- public String toString(){
- StringBuilder sb = new StringBuilder();
- sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' );
- return sb.toString();
- }
- }
- }
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java (working copy)
@@ -1,57 +0,0 @@
-package org.apache.lucene.search.vectorhighlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.index.IndexReader;
-
-/**
- * FragmentsBuilder is an interface for fragments (snippets) builder classes.
- * A FragmentsBuilder class can be plugged in to Highlighter.
- */
-public interface FragmentsBuilder {
-
- /**
- * create a fragment.
- *
- * @param reader IndexReader of the index
- * @param docId document id to be highlighted
- * @param fieldName field of the document to be highlighted
- * @param fieldFragList FieldFragList object
- * @return a created fragment or null when no fragment created
- * @throws IOException
- */
- public String createFragment( IndexReader reader, int docId, String fieldName,
- FieldFragList fieldFragList ) throws IOException;
-
- /**
- * create multiple fragments.
- *
- * @param reader IndexReader of the index
- * @param docId document id to be highlighter
- * @param fieldName field of the document to be highlighted
- * @param fieldFragList FieldFragList object
- * @param maxNumFragments maximum number of fragments
- * @return created fragments or null when no fragments created.
- * size of the array can be less than maxNumFragments
- * @throws IOException
- */
- public String[] createFragments( IndexReader reader, int docId, String fieldName,
- FieldFragList fieldFragList, int maxNumFragments ) throws IOException;
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java (working copy)
@@ -1,84 +0,0 @@
-package org.apache.lucene.search.vectorhighlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
-
-/**
- * A simple implementation of FragListBuilder.
- */
-public class SimpleFragListBuilder implements FragListBuilder {
-
- public static final int MARGIN = 6;
- public static final int MIN_FRAG_CHAR_SIZE = MARGIN * 3;
-
- public FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) {
- if( fragCharSize < MIN_FRAG_CHAR_SIZE )
- throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " +
- MIN_FRAG_CHAR_SIZE + " or higher." );
-
- FieldFragList ffl = new FieldFragList( fragCharSize );
-
- List wpil = new ArrayList();
- Iterator ite = fieldPhraseList.phraseList.iterator();
- WeightedPhraseInfo phraseInfo = null;
- int startOffset = 0;
- boolean taken = false;
- while( true ){
- if( !taken ){
- if( !ite.hasNext() ) break;
- phraseInfo = ite.next();
- }
- taken = false;
- if( phraseInfo == null ) break;
-
- // if the phrase violates the border of previous fragment, discard it and try next phrase
- if( phraseInfo.getStartOffset() < startOffset ) continue;
-
- wpil.clear();
- wpil.add( phraseInfo );
- int st = phraseInfo.getStartOffset() - MARGIN < startOffset ?
- startOffset : phraseInfo.getStartOffset() - MARGIN;
- int en = st + fragCharSize;
- if( phraseInfo.getEndOffset() > en )
- en = phraseInfo.getEndOffset();
- startOffset = en;
-
- while( true ){
- if( ite.hasNext() ){
- phraseInfo = ite.next();
- taken = true;
- if( phraseInfo == null ) break;
- }
- else
- break;
- if( phraseInfo.getEndOffset() <= en )
- wpil.add( phraseInfo );
- else
- break;
- }
- ffl.add( st, en, wpil );
- }
- return ffl;
- }
-
-}
Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java
===================================================================
--- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java (revision 956773)
+++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java (working copy)
@@ -1,128 +0,0 @@
-package org.apache.lucene.search.vectorhighlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
-import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
-
-/**
- * FieldFragList has a list of "frag info" that is used by FragmentsBuilder class
- * to create fragments (snippets).
- */
-public class FieldFragList {
-
- List fragInfos = new ArrayList();
-
- /**
- * a constructor.
- *
- * @param fragCharSize the length (number of chars) of a fragment
- */
- public FieldFragList( int fragCharSize ){
- }
-
- /**
- * convert the list of WeightedPhraseInfo to WeightedFragInfo, then add it to the fragInfos
- *
- * @param startOffset start offset of the fragment
- * @param endOffset end offset of the fragment
- * @param phraseInfoList list of WeightedPhraseInfo objects
- */
- public void add( int startOffset, int endOffset, List phraseInfoList ){
- fragInfos.add( new WeightedFragInfo( startOffset, endOffset, phraseInfoList ) );
- }
-
- public static class WeightedFragInfo {
-
- List