Index: contrib/highlighter/build.xml =================================================================== --- contrib/highlighter/build.xml (revision 634668) +++ contrib/highlighter/build.xml (working copy) @@ -1,27 +1,29 @@ - + - - - Hits highlighter + Hits highlighter + + + + + + + + + + + + + + Highlighter building dependency ${memory.jar} + + + + + Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (revision 0) @@ -0,0 +1,95 @@ +package org.apache.lucene.search.highlight; + + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.lucene.analysis.Token; + +import java.util.List; + + +/** + * {@link Fragmenter} implementation which breaks text up into same-size + * fragments but does not split up Spans. This is a simple sample class. 
+ */ +public class SimpleSpanFragmenter implements Fragmenter { + private static final int DEFAULT_FRAGMENT_SIZE = 100; + private int fragmentSize; + private int currentNumFrags; + private int position = -1; + private SpanScorer spanScorer; + private int waitForPos = -1; + + /** + * @param spanscorer SpanScorer that was used to score hits + */ + public SimpleSpanFragmenter(SpanScorer spanscorer) { + this(spanscorer, DEFAULT_FRAGMENT_SIZE); + } + + /** + * @param spanscorer SpanScorer that was used to score hits + * @param fragmentSize size in bytes of each fragment + */ + public SimpleSpanFragmenter(SpanScorer spanscorer, int fragmentSize) { + this.fragmentSize = fragmentSize; + this.spanScorer = spanscorer; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment(org.apache.lucene.analysis.Token) + */ + public boolean isNewFragment(Token token) { + position += token.getPositionIncrement(); + + if (waitForPos == position) { + waitForPos = -1; + } else if (waitForPos != -1) { + return false; + } + + WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(new String(token.termBuffer(), 0, token.termLength())); + + if (wSpanTerm != null) { + List positionSpans = wSpanTerm.getPositionSpans(); + + for (int i = 0; i < positionSpans.size(); i++) { + if (((PositionSpan) positionSpans.get(i)).start == position) { + waitForPos = ((PositionSpan) positionSpans.get(i)).end + 1; + + return true; + } + } + } + + boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags); + + if (isNewFrag) { + currentNumFrags++; + } + + return isNewFrag; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String) + */ + public void start(String originalText) { + position = 0; + currentNumFrags = 1; + } +} Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html (revision 0) @@ -0,0 +1,54 @@ + + +
+The spanscorer classes provide the Highlighter with the ability
+to highlight only those Tokens that contributed to a query match.
+The SpanScorer class is the central component; it attempts to score Terms
+based on whether they actually participated in scoring the Query.
+
+The implementation is very similar to QueryScorer in that WeightedSpanTerms are extracted
+from the given Query and then placed in a Map. During Token scoring, Terms found in
+the Map return a score equal to their weight. The added wrinkle is that, when terms are
+extracted, any position-sensitive sub-queries of the Query are converted to SpanQuerys, and
+SpanQuery.getSpans() is run against a MemoryIndex built from the TokenStream of the text
+to be highlighted. The start and end positions of the matching Spans are recorded with the
+respective WeightedSpanTerms, and these positions are then used to filter possible Token
+matches during scoring.
+
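+The extraction machinery can also be exercised directly. The following is a minimal
+sketch only; query, analyzer and text are assumed to come from the surrounding
+application, and "kennedy" is an illustrative term:
+<pre>
+	CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(
+			FIELD_NAME, new StringReader(text)));
+	WeightedSpanTermExtractor extractor = new WeightedSpanTermExtractor();
+	Map terms = extractor.getWeightedSpanTerms(query, tokenStream, FIELD_NAME);
+
+	WeightedSpanTerm term = (WeightedSpanTerm) terms.get("kennedy");
+	if (term != null &amp;&amp; term.isPositionSensitive()) {
+		// Only positions recorded from matching Spans will contribute to scoring.
+		boolean wouldScore = term.checkPosition(5); // 5 is an arbitrary position
+	}
+</pre>
+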
+<h2>Example Usage</h2>
+
+<pre>
+	IndexSearcher searcher = new IndexSearcher(ramDir);
+	Query query = new QueryParser(FIELD_NAME, analyzer).parse("Kenne*");
+	query = query.rewrite(reader); //required to expand search terms
+	Hits hits = searcher.search(query);
+
+	for (int i = 0; i < hits.length(); i++)
+	{
+		String text = hits.doc(i).get(FIELD_NAME);
+		CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(
+                        FIELD_NAME, new StringReader(text)));
+		Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, tokenStream));
+		tokenStream.reset();
+
+		// Get 3 best fragments and separate with a "..."
+		String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
+		System.out.println(result);
+	}
+
+</pre>
+
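+This patch also introduces SimpleSpanFragmenter, which produces same-size fragments like
+SimpleFragmenter but avoids splitting a matching Span across fragment boundaries. A minimal
+sketch of wiring it in, reusing the variables from the example above (the fragment size of
+40 is arbitrary):
+<pre>
+	SpanScorer scorer = new SpanScorer(query, FIELD_NAME, tokenStream);
+	Highlighter highlighter = new Highlighter(scorer);
+	highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 40));
+	tokenStream.reset();
+	String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
+</pre>
+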
+If you make a call to any of the getBestFragments() methods more than once, you must call reset() on the SpanScorer +between each call. +
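+For example, a second pass with the same scorer might look like this (a minimal sketch,
+assuming the SpanScorer from the example above was kept in a local variable named
+spanScorer):
+<pre>
+	spanScorer.reset();  // required before re-using the scorer
+	tokenStream.reset(); // replay the CachingTokenFilter as well
+	String result2 = highlighter.getBestFragments(tokenStream, text, 3, "...");
+</pre>
+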
+The SpanScorer class has a constructor which accepts an IndexReader, used to derive the IDF
+(inverse document frequency) for each term in order to influence the score. This is useful
+for extracting the most significant sections of a document and for supplying the scores used
+by the GradientFormatter to color significant words more strongly. The value returned by
+SpanScorer.getMaxTermWeight() is useful when passed to the GradientFormatter constructor to
+define the top score, which is associated with the top color.
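+For example (a minimal sketch; reader is assumed to be an open IndexReader over the index
+being searched, and the colors are illustrative):
+<pre>
+	CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(
+			FIELD_NAME, new StringReader(text)));
+	SpanScorer scorer = new SpanScorer(query, FIELD_NAME, tokenStream, reader);
+	// Black-to-red foreground gradient; null disables background coloring.
+	GradientFormatter formatter = new GradientFormatter(scorer.getMaxTermWeight(),
+			"#000000", "#FF0000", null, null);
+	Highlighter highlighter = new Highlighter(formatter, scorer);
+	tokenStream.reset();
+	String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
+</pre>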
+ + + + Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (revision 0) @@ -0,0 +1,218 @@ +package org.apache.lucene.search.highlight; + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Query; + + +/** + * {@link Scorer} implementation which scores text fragments by the number of + * unique query terms found. This class converts appropriate Querys to + * SpanQuerys and attempts to score only those terms that participated in + * generating the 'hit' on the document. + */ +public class SpanScorer implements Scorer { + private float totalScore; + private Set foundTerms; + private Map fieldWeightedSpanTerms; + private float maxTermWeight; + private int position = -1; + private String defaultField; + private boolean highlightCnstScrRngQuery; + + /** + * @param query + * Query to use for highlighting + * @param field + * Field to highlight - pass null to ignore fields + * @param tokenStream + * of source text to be highlighted + * @throws IOException + */ + public SpanScorer(Query query, String field, + CachingTokenFilter cachingTokenFilter) throws IOException { + init(query, field, cachingTokenFilter, null); + } + + /** + * @param query + * Query to use for highlighting + * @param field + * Field to highlight - pass null to ignore fields + * @param tokenStream + * of source text to be highlighted + * @param reader + * @throws IOException + */ + public SpanScorer(Query query, String field, + CachingTokenFilter cachingTokenFilter, IndexReader reader) + throws IOException { + init(query, field, cachingTokenFilter, reader); + } + + /** + * As above, but with ability to pass in an IndexReader + */ + public SpanScorer(Query query, String field, + CachingTokenFilter cachingTokenFilter, IndexReader reader, String defaultField) + throws IOException { + this.defaultField = defaultField.intern(); + init(query, field, cachingTokenFilter, reader); + } + + /** + * @param defaultField - The default field for queries with the field name unspecified + */ + public SpanScorer(Query query, String field, + CachingTokenFilter cachingTokenFilter, String defaultField) throws IOException { + this.defaultField = defaultField.intern(); + init(query, field, cachingTokenFilter, null); + } + + /** + * @param weightedTerms + */ + public SpanScorer(WeightedSpanTerm[] weightedTerms) { + this.fieldWeightedSpanTerms = new HashMap(weightedTerms.length); + + for (int i = 0; i < weightedTerms.length; i++) { + WeightedSpanTerm existingTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get(weightedTerms[i].term); + + if ((existingTerm == null) || + (existingTerm.weight < weightedTerms[i].weight)) { + // if a term is defined more than once, always use the highest + // scoring weight + fieldWeightedSpanTerms.put(weightedTerms[i].term, weightedTerms[i]); + maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight()); + } + } + } + + /* + * (non-Javadoc) + * + * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore() + */ + public float getFragmentScore() { + return totalScore; + } + + /** 
+ * + * @return The highest weighted term (useful for passing to + * GradientFormatter to set top end of coloring scale. + */ + public float getMaxTermWeight() { + return maxTermWeight; + } + + /* + * (non-Javadoc) + * + * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token, + * int) + */ + public float getTokenScore(Token token) { + position += token.getPositionIncrement(); + String termText = new String(token.termBuffer(), 0, token.termLength()); + + WeightedSpanTerm weightedSpanTerm; + + if ((weightedSpanTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get( + termText)) == null) { + return 0; + } + + if (weightedSpanTerm.positionSensitive && + !weightedSpanTerm.checkPosition(position)) { + return 0; + } + + float score = weightedSpanTerm.getWeight(); + + // found a query term - is it unique in this doc? + if (!foundTerms.contains(termText)) { + totalScore += score; + foundTerms.add(termText); + } + + return score; + } + + /** + * Retrieve the WeightedSpanTerm for the specified token. Useful for passing + * Span information to a Fragmenter. + * + * @param token + * @return WeightedSpanTerm for token + */ + public WeightedSpanTerm getWeightedSpanTerm(String token) { + return (WeightedSpanTerm) fieldWeightedSpanTerms.get(token); + } + + /** + * @param query + * @param field + * @param tokenStream + * @param reader + * @throws IOException + */ + private void init(Query query, String field, + CachingTokenFilter cachingTokenFilter, IndexReader reader) + throws IOException { + WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor() + : new WeightedSpanTermExtractor(defaultField); + + qse.setHighlightCnstScrRngQuery(highlightCnstScrRngQuery); + + if (reader == null) { + this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query, + cachingTokenFilter, field); + } else { + this.fieldWeightedSpanTerms = qse.getWeightedSpanTermsWithScores(query, + cachingTokenFilter, field, reader); + } + } + + /** + * @return whether ConstantScoreRangeQuerys are set to be highlighted + */ + public boolean isHighlightCnstScrRngQuery() { + return highlightCnstScrRngQuery; + } + + /** + * If you call Highlighter#getBestFragment() more than once you must reset + * the SpanScorer between each call. + */ + public void reset() { + position = -1; + } + + /** + * Turns highlighting of ConstantScoreRangeQuery on/off. ConstantScoreRangeQuerys cannot be + * highlighted if you rewrite the query first. + * + * @param highlightCnstScrRngQuery + */ + public void setHighlightCnstScrRngQuery(boolean highlightCnstScrRngQuery) { + this.highlightCnstScrRngQuery = highlightCnstScrRngQuery; + } + + /* + * (non-Javadoc) + * + * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment) + */ + public void startFragment(TextFragment newFragment) { + foundTerms = new HashSet(); + totalScore = 0; + } +} Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java (revision 0) @@ -0,0 +1,104 @@ +package org.apache.lucene.search.highlight; + + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + + +/** + * Lightweight class to hold term, weight, and positions used for scoring this + * term. + */ +public class WeightedSpanTerm extends WeightedTerm{ + boolean positionSensitive; + private List positionSpans = new ArrayList(); + + /** + * @param weight + * @param term + */ + public WeightedSpanTerm(float weight, String term) { + super(weight, term); + this.positionSpans = new ArrayList(); + } + + /** + * @param weight + * @param term + * @param positionSensitive + */ + public WeightedSpanTerm(float weight, String term, boolean positionSensitive) { + super(weight, term); + this.positionSensitive = positionSensitive; + } + + /** + * Checks to see if this term is valid at position. + * + * @param position + * to check against valid term postions + * @return true iff this term is a hit at this position + */ + public boolean checkPosition(int position) { + // There would probably be a slight speed improvement if PositionSpans + // where kept in some sort of priority queue - that way this method + // could + // bail early without checking each PositionSpan. + Iterator positionSpanIt = positionSpans.iterator(); + + while (positionSpanIt.hasNext()) { + PositionSpan posSpan = (PositionSpan) positionSpanIt.next(); + + if (((position >= posSpan.start) && (position <= posSpan.end))) { + return true; + } + } + + return false; + } + + public void addPositionSpans(List positionSpans) { + this.positionSpans.addAll(positionSpans); + } + + public boolean isPositionSensitive() { + return positionSensitive; + } + + public void setPositionSensitive(boolean positionSensitive) { + this.positionSensitive = positionSensitive; + } + + public List getPositionSpans() { + return positionSpans; + } +} + + +// Utility class to store a Span +class PositionSpan { + int start; + int end; + + public PositionSpan(int start, int end) { + this.start = start; + this.end = end; + } +} Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 0) @@ -0,0 +1,436 @@ +package org.apache.lucene.search.highlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.FilterIndexReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreRangeQuery; +import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.FilteredQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiPhraseQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; + +/** + * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether Terms from the query are contained in a supplied TokenStream. + */ +public class WeightedSpanTermExtractor { + + private String fieldName; + private CachingTokenFilter cachedTokenFilter; + private Map readers = new HashMap(10); // Map + private String defaultField; + private boolean highlightCnstScrRngQuery; + + public WeightedSpanTermExtractor() { + } + + public WeightedSpanTermExtractor(String defaultField) { + if (defaultField != null) { + this.defaultField = defaultField.intern(); + } + } + + private void closeReaders() { + Collection readerSet = readers.values(); + Iterator it = readerSet.iterator(); + + while (it.hasNext()) { + IndexReader reader = (IndexReader) it.next(); + try { + reader.close(); + } catch (IOException e) { + // alert? + } + } + } + + /** + * Fills a Map with <@link WeightedSpanTerm>s using the terms from the supplied Query. 
+ * + * @param query + * Query to extract Terms from + * @param terms + * Map to place created WeightedSpanTerms in + * @throws IOException + */ + private void extract(Query query, Map terms) throws IOException { + if (query instanceof BooleanQuery) { + BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses(); + Map booleanTerms = new HashMap(); + for (int i = 0; i < queryClauses.length; i++) { + if (!queryClauses[i].isProhibited()) { + extract(queryClauses[i].getQuery(), booleanTerms); + } + } + terms.putAll(booleanTerms); + } else if (query instanceof PhraseQuery) { + Term[] phraseQueryTerms = ((PhraseQuery) query).getTerms(); + SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length]; + for (int i = 0; i < phraseQueryTerms.length; i++) { + clauses[i] = new SpanTermQuery(phraseQueryTerms[i]); + } + + int slop = ((PhraseQuery) query).getSlop(); + boolean inorder = false; + + if (slop == 0) { + inorder = true; + } + + SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder); + sp.setBoost(query.getBoost()); + extractWeightedSpanTerms(terms, sp); + } else if (query instanceof TermQuery) { + extractWeightedTerms(terms, query); + } else if (query instanceof SpanQuery) { + extractWeightedSpanTerms(terms, (SpanQuery) query); + } else if (query instanceof FilteredQuery) { + extract(((FilteredQuery) query).getQuery(), terms); + } else if (query instanceof DisjunctionMaxQuery) { + Map disjunctTerms = new HashMap(); + for (Iterator iterator = ((DisjunctionMaxQuery) query).iterator(); iterator.hasNext();) { + extract((Query) iterator.next(), disjunctTerms); + } + terms.putAll(disjunctTerms); + } else if (query instanceof MultiPhraseQuery) { + final MultiPhraseQuery mpq = (MultiPhraseQuery) query; + final List termArrays = mpq.getTermArrays(); + final int[] positions = mpq.getPositions(); + if (positions.length > 0) { + + int maxPosition = positions[positions.length - 1]; + for (int i = 0; i < positions.length - 1; ++i) { + if (positions[i] > maxPosition) { + maxPosition = positions[i]; + } + } + + final List[] disjunctLists = new List[maxPosition + 1]; + int distinctPositions = 0; + + for (int i = 0; i < termArrays.size(); ++i) { + final Term[] termArray = (Term[]) termArrays.get(i); + List disjuncts = disjunctLists[positions[i]]; + if (disjuncts == null) { + disjuncts = (disjunctLists[positions[i]] = new ArrayList(termArray.length)); + ++distinctPositions; + } + for (int j = 0; j < termArray.length; ++j) { + disjuncts.add(new SpanTermQuery(termArray[j])); + } + } + + int positionGaps = 0; + int position = 0; + final SpanQuery[] clauses = new SpanQuery[distinctPositions]; + for (int i = 0; i < disjunctLists.length; ++i) { + List disjuncts = disjunctLists[i]; + if (disjuncts != null) { + clauses[position++] = new SpanOrQuery((SpanQuery[]) disjuncts + .toArray(new SpanQuery[disjuncts.size()])); + } else { + ++positionGaps; + } + } + + final int slop = mpq.getSlop(); + final boolean inorder = (slop == 0); + + SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder); + sp.setBoost(query.getBoost()); + extractWeightedSpanTerms(terms, sp); + } + } else if (query instanceof ConstantScoreRangeQuery) { + ConstantScoreRangeQuery q = (ConstantScoreRangeQuery) query; + Term lower = new Term(fieldName, q.getLowerVal()); + Term upper = new Term(fieldName, q.getUpperVal()); + FilterIndexReader fir = new FilterIndexReader(getReaderForField(fieldName)); + try { + TermEnum te = fir.terms(lower); + BooleanQuery bq = new BooleanQuery(); + do { + Term term = te.term(); + if (term != 
null && upper.compareTo(term) >= 0) { + bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD)); + } else { + break; + } + } while (te.next()); + extract(bq, terms); + } finally { + fir.close(); + } + } else { + // NO-OP + System.out.println("found none"); + } + } + + /** + * Fills a Map with <@link WeightedSpanTerm>s using the terms from the supplied SpanQuery. + * + * @param terms + * Map to place created WeightedSpanTerms in + * @param spanQuery + * SpanQuery to extract Terms from + * @throws IOException + */ + private void extractWeightedSpanTerms(Map terms, SpanQuery spanQuery) throws IOException { + Set nonWeightedTerms = new HashSet(); + spanQuery.extractTerms(nonWeightedTerms); + + Set fieldNames; + + if (fieldName == null) { + fieldNames = new HashSet(); + for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) { + Term queryTerm = (Term) iter.next(); + fieldNames.add(queryTerm.field()); + } + } else { + fieldNames = new HashSet(1); + fieldNames.add(fieldName); + } + // To support the use of the default field name + if (defaultField != null) { + fieldNames.add(defaultField); + } + + Iterator it = fieldNames.iterator(); + List spanPositions = new ArrayList(); + + while (it.hasNext()) { + String field = (String) it.next(); + + IndexReader reader = getReaderForField(field); + Spans spans = spanQuery.getSpans(reader); + + // collect span positions + while (spans.next()) { + spanPositions.add(new PositionSpan(spans.start(), spans.end() - 1)); + } + + cachedTokenFilter.reset(); + } + + if (spanPositions.size() == 0) { + // no spans found + return; + } + + for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) { + Term queryTerm = (Term) iter.next(); + + if (fieldNameComparator(queryTerm.field())) { + WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(queryTerm.text()); + + if (weightedSpanTerm == null) { + weightedSpanTerm = new WeightedSpanTerm(spanQuery.getBoost(), queryTerm.text()); + weightedSpanTerm.addPositionSpans(spanPositions); + weightedSpanTerm.positionSensitive = true; + terms.put(queryTerm.text(), weightedSpanTerm); + } else { + if (spanPositions.size() > 0) { + weightedSpanTerm.addPositionSpans(spanPositions); + weightedSpanTerm.positionSensitive = true; + } + } + } + } + } + + /** + * Fills a Map with <@link WeightedSpanTerm>s using the terms from the supplied Query. 
+ * + * @param terms + * Map to place created WeightedSpanTerms in + * @param query + * Query to extract Terms from + * @throws IOException + */ + private void extractWeightedTerms(Map terms, Query query) throws IOException { + Set nonWeightedTerms = new HashSet(); + query.extractTerms(nonWeightedTerms); + + for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) { + Term queryTerm = (Term) iter.next(); + + if (fieldNameComparator(queryTerm.field())) { + WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.getBoost(), queryTerm.text()); + terms.put(queryTerm.text(), weightedSpanTerm); + } + } + } + + /** + * Necessary to implement matches for queries against defaultField + */ + private boolean fieldNameComparator(String fieldNameToCheck) { + boolean rv = fieldName == null || fieldNameToCheck == fieldName + || fieldNameToCheck == defaultField; + return rv; + } + + private IndexReader getReaderForField(String field) { + IndexReader reader = (IndexReader) readers.get(field); + if (reader == null) { + MemoryIndex indexer = new MemoryIndex(); + indexer.addField(field, cachedTokenFilter); + IndexSearcher searcher = indexer.createSearcher(); + reader = searcher.getIndexReader(); + readers.put(field, reader); + } + return reader; + } + + /** + * Creates a Map of WeightedSpanTerms from the given Query and TokenStream. + * + *
+ * + * @param query + * that caused hit + * @param tokenStream + * of text to be highlighted + * @return + * @throws IOException + */ + public Map getWeightedSpanTerms(Query query, CachingTokenFilter cachingTokenFilter) + throws IOException { + this.fieldName = null; + this.cachedTokenFilter = cachingTokenFilter; + + Map terms = new HashMap(); + try { + extract(query, terms); + } finally { + closeReaders(); + } + + return terms; + } + + /** + * Creates a Map of WeightedSpanTerms from the given Query and TokenStream. + * + *
+ * + * @param query + * that caused hit + * @param tokenStream + * of text to be highlighted + * @param fieldName + * restricts Term's used based on field name + * @return + * @throws IOException + */ + public Map getWeightedSpanTerms(Query query, CachingTokenFilter cachingTokenFilter, + String fieldName) throws IOException { + if (fieldName != null) { + this.fieldName = fieldName.intern(); + } + + Map terms = new HashMap(); + this.cachedTokenFilter = cachingTokenFilter; + try { + extract(query, terms); + } finally { + closeReaders(); + } + + return terms; + } + + /** + * Creates a Map of WeightedSpanTerms from the given Query and TokenStream. Uses a supplied + * IndexReader to properly weight terms (for gradient highlighting). + * + *
+ * + * @param query + * that caused hit + * @param tokenStream + * of text to be highlighted + * @param fieldName + * restricts Term's used based on field name + * @param reader + * to use for scoring + * @return + * @throws IOException + */ + public Map getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream, String fieldName, + IndexReader reader) throws IOException { + this.fieldName = fieldName; + this.cachedTokenFilter = new CachingTokenFilter(tokenStream); + + Map terms = new HashMap(); + extract(query, terms); + + int totalNumDocs = reader.numDocs(); + Set weightedTerms = terms.keySet(); + Iterator it = weightedTerms.iterator(); + + try { + while (it.hasNext()) { + WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(it.next()); + int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term)); + + // IDF algorithm taken from DefaultSimilarity class + float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0); + weightedSpanTerm.weight *= idf; + } + } finally { + + closeReaders(); + } + + return terms; + } + + public boolean isHighlightCnstScrRngQuery() { + return highlightCnstScrRngQuery; + } + + public void setHighlightCnstScrRngQuery(boolean highlightCnstScrRngQuery) { + this.highlightCnstScrRngQuery = highlightCnstScrRngQuery; + } +} Index: contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java =================================================================== --- contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 634668) +++ contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (working copy) @@ -21,32 +21,46 @@ import java.io.IOException; import java.io.Reader; import java.io.StringReader; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.StringTokenizer; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import junit.framework.TestCase; -import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.LowerCaseTokenizer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; -import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FilteredQuery; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.MultiSearcher; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.RangeFilter; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner; import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanNotQuery; 
import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.RAMDirectory; @@ -55,864 +69,1389 @@ /** * JUnit Test for Highlighter class. + * * @author mark@searcharea.co.uk */ -public class HighlighterTest extends TestCase implements Formatter -{ - private IndexReader reader; - private static final String FIELD_NAME = "contents"; - private Query query; - RAMDirectory ramDir; - public Searcher searcher = null; - public Hits hits = null; - int numHighlights = 0; - Analyzer analyzer=new StandardAnalyzer(); +public class HighlighterTest extends TestCase implements Formatter { + private IndexReader reader; + static final String FIELD_NAME = "contents"; + private Query query; + RAMDirectory ramDir; + public Searcher searcher = null; + public Hits hits = null; + int numHighlights = 0; + Analyzer analyzer = new StandardAnalyzer(); - String texts[] = - { - "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot", - "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy", - "JFK has been shot", - "John Kennedy has been shot", - "This text has a typo in referring to Keneddy" }; + String[] texts = { + "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot", + "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy", + "JFK has been shot", "John Kennedy has been shot", + "This text has a typo in referring to Keneddy", + "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b" }; - /** - * Constructor for HighlightExtractorTest. - * @param arg0 - */ - public HighlighterTest(String arg0) - { - super(arg0); - } + /** + * Constructor for HighlightExtractorTest. + * + * @param arg0 + */ + public HighlighterTest(String arg0) { + super(arg0); + } - public void testSimpleHighlighter() throws Exception - { - doSearching("Kennedy"); - Highlighter highlighter = new Highlighter(new QueryScorer(query)); - highlighter.setTextFragmenter(new SimpleFragmenter(40)); - int maxNumFragmentsRequired = 2; - for (int i = 0; i < hits.length(); i++) - { - String text = hits.doc(i).get(FIELD_NAME); - TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + public void testHighlightingWithDefaultField() throws Exception { - String result = - highlighter.getBestFragments(tokenStream,text,maxNumFragmentsRequired, "..."); - System.out.println("\t" + result); - } - //Not sure we can assert anything here - just running to check we dont throw any exceptions - } + String s1 = "I call our world Flatland, not because we call it so,"; + QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer()); + // Verify that a query against the default field results in text being + // highlighted + // regardless of the field name. 
+ Query q = parser.parse("\"world Flatland\"~3"); + String expected = "I call our world Flatland, not because we call it so,"; + String observed = highlightField(q, "SOME_FIELD_NAME", s1); + System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed); + assertEquals("Query in the default field results in text for *ANY* field being highlighted", + expected, observed); - public void testGetBestFragmentsSimpleQuery() throws Exception - { - doSearching("Kennedy"); - doStandardHighlights(); - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); - } - public void testGetFuzzyFragments() throws Exception - { - doSearching("Kinnedy~"); - doStandardHighlights(); - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); - } + // Verify that a query against a named field does not result in any + // highlighting + // when the query field name differs from the name of the field being + // highlighted, + // which in this example happens to be the default field name. + q = parser.parse("text:\"world Flatland\"~3"); + expected = s1; + observed = highlightField(q, FIELD_NAME, s1); + System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed); + assertEquals( + "Query in a named field does not result in highlighting when that field isn't in the query", + s1, highlightField(q, FIELD_NAME, s1)); + } - public void testGetWildCardFragments() throws Exception - { - doSearching("K?nnedy"); - doStandardHighlights(); - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); - } - public void testGetMidWildCardFragments() throws Exception - { - doSearching("K*dy"); - doStandardHighlights(); - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); - } - public void testGetRangeFragments() throws Exception - { - String queryString=FIELD_NAME + ":[kannedy TO kznnedy]"; - - //Need to explicitly set the QueryParser property to use RangeQuery rather than RangeFilters - QueryParser parser=new QueryParser(FIELD_NAME, new StandardAnalyzer()); - parser.setUseOldRangeQuery(true); - query = parser.parse(queryString); - doSearching(query); - - doStandardHighlights(); - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); - } + /** + * This method intended for use with testHighlightingWithDefaultField() + */ + private static String highlightField(Query query, String fieldName, String text) + throws IOException { + CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer().tokenStream( + fieldName, new StringReader(text))); + // Assuming "", "" used to highlight + SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(); + Highlighter highlighter = new Highlighter(formatter, new SpanScorer(query, fieldName, + tokenStream, FIELD_NAME)); + highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE)); + tokenStream.reset(); + String rv = highlighter.getBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)"); + return rv.length() == 0 ? 
text : rv; + } - public void testGetBestFragmentsPhrase() throws Exception - { - doSearching("\"John Kennedy\""); - doStandardHighlights(); - //Currently highlights "John" and "Kennedy" separately - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); - } - public void testGetBestFragmentsSpan() throws Exception - { - SpanQuery clauses[]={ - new SpanTermQuery(new Term("contents","john")), - new SpanTermQuery(new Term("contents","kennedy")), - }; - - SpanNearQuery snq=new SpanNearQuery(clauses,1,true); - doSearching(snq); - doStandardHighlights(); - //Currently highlights "John" and "Kennedy" separately - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); - } + public void testSimpleSpanHighlighter() throws Exception { + doSearching("Kennedy"); - public void testOffByOne() throws IOException - { - TermQuery query= new TermQuery( new Term( "data", "help" )); - Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer( query )); - hg.setTextFragmenter( new NullFragmenter() ); + int maxNumFragmentsRequired = 2; - String match = null; - match = hg.getBestFragment( new StandardAnalyzer(), "data", "help me [54-65]"); - assertEquals("help me [54-65]", match); - } - public void testGetBestFragmentsFilteredQuery() throws Exception - { - RangeFilter rf=new RangeFilter("contents","john","john",true,true); - SpanQuery clauses[]={ - new SpanTermQuery(new Term("contents","john")), - new SpanTermQuery(new Term("contents","kennedy")), - }; - SpanNearQuery snq=new SpanNearQuery(clauses,1,true); - FilteredQuery fq=new FilteredQuery(snq,rf); - - doSearching(fq); - doStandardHighlights(); - //Currently highlights "John" and "Kennedy" separately - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); - } - public void testGetBestFragmentsFilteredPhraseQuery() throws Exception - { - RangeFilter rf=new RangeFilter("contents","john","john",true,true); - PhraseQuery pq=new PhraseQuery(); - pq.add(new Term("contents","john")); - pq.add(new Term("contents","kennedy")); - FilteredQuery fq=new FilteredQuery(pq,rf); - - doSearching(fq); - doStandardHighlights(); - //Currently highlights "John" and "Kennedy" separately - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); - } + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(FIELD_NAME, + new StringReader(text))); + Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.reset(); - public void testGetBestFragmentsMultiTerm() throws Exception - { - doSearching("John Kenn*"); - doStandardHighlights(); - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); - } - public void testGetBestFragmentsWithOr() throws Exception - { - doSearching("JFK OR Kennedy"); - doStandardHighlights(); - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); - } + String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, + "..."); + System.out.println("\t" + result); + } - public void testGetBestSingleFragment() throws Exception - { - doSearching("Kennedy"); - Highlighter highlighter =new 
Highlighter(this,new QueryScorer(query)); - highlighter.setTextFragmenter(new SimpleFragmenter(40)); + // Not sure we can assert anything here - just running to check we dont + // throw any exceptions + } - for (int i = 0; i < hits.length(); i++) - { - String text = hits.doc(i).get(FIELD_NAME); - TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); - String result = highlighter.getBestFragment(tokenStream,text); - System.out.println("\t" + result); - } - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); + public void testSimpleSpanPhraseHighlighting() throws Exception { + doSearching("\"very long and contains\""); - numHighlights = 0; - for (int i = 0; i < hits.length(); i++) - { - String text = hits.doc(i).get(FIELD_NAME); - highlighter.getBestFragment(analyzer, FIELD_NAME,text); - } - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); + int maxNumFragmentsRequired = 2; - numHighlights = 0; - for (int i = 0; i < hits.length(); i++) - { - String text = hits.doc(i).get(FIELD_NAME); - highlighter.getBestFragments(analyzer,FIELD_NAME, text, 10); - } - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(FIELD_NAME, + new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.reset(); - } + String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, + "..."); + System.out.println("\t" + result); + } - public void testGetBestSingleFragmentWithWeights() throws Exception - { - WeightedTerm[]wTerms=new WeightedTerm[2]; - wTerms[0]=new WeightedTerm(10f,"hello"); - wTerms[1]=new WeightedTerm(1f,"kennedy"); - Highlighter highlighter =new Highlighter(new QueryScorer(wTerms)); - TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0])); - highlighter.setTextFragmenter(new SimpleFragmenter(2)); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 3); + } - String result = highlighter.getBestFragment(tokenStream,texts[0]).trim(); - assertTrue("Failed to find best section using weighted terms. Found: ["+result+"]" - , "Hello".equals(result)); + public void testSimpleSpanPhraseHighlighting2() throws Exception { + doSearching("\"text piece long\"~5"); - //readjust weights - wTerms[1].setWeight(50f); - tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0])); - highlighter =new Highlighter(new QueryScorer(wTerms)); - highlighter.setTextFragmenter(new SimpleFragmenter(2)); + int maxNumFragmentsRequired = 2; - result = highlighter.getBestFragment(tokenStream,texts[0]).trim(); - assertTrue("Failed to find best section using weighted terms. 
Found: "+result - , "kennedy".equals(result)); - } - - - // tests a "complex" analyzer that produces multiple - // overlapping tokens - public void testOverlapAnalyzer() throws Exception - { - HashMap synonyms = new HashMap(); - synonyms.put("football", "soccer,footie"); - Analyzer analyzer = new SynonymAnalyzer(synonyms); - String srchkey = "football"; + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(FIELD_NAME, + new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.reset(); - String s = "football-soccer in the euro 2004 footie competition"; - QueryParser parser=new QueryParser("bookid",analyzer); - Query query = parser.parse(srchkey); + String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, + "..."); + System.out.println("\t" + result); + } - Highlighter highlighter = new Highlighter(new QueryScorer(query)); - TokenStream tokenStream = - analyzer.tokenStream(null, new StringReader(s)); - // Get 3 best fragments and seperate with a "..." - String result = highlighter.getBestFragments(tokenStream, s, 3, "..."); - String expectedResult="football-soccer in the euro 2004 footie competition"; - assertTrue("overlapping analyzer should handle highlights OK",expectedResult.equals(result)); - } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 6); + } + public void testSimpleSpanPhraseHighlighting3() throws Exception { + doSearching("\"x y z\""); - public void testGetSimpleHighlight() throws Exception - { - doSearching("Kennedy"); - Highlighter highlighter = - new Highlighter(this,new QueryScorer(query)); + int maxNumFragmentsRequired = 2; - for (int i = 0; i < hits.length(); i++) - { - String text = hits.doc(i).get(FIELD_NAME); - TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(FIELD_NAME, + new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.reset(); - String result = highlighter.getBestFragment(tokenStream,text); - System.out.println("\t" + result); - } - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); - } + String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, + "..."); + System.out.println("\t" + result); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 3); + } + } - public void testGetTextFragments() throws Exception - { - doSearching("Kennedy"); - Highlighter highlighter = - new Highlighter(this,new QueryScorer(query)); - highlighter.setTextFragmenter(new SimpleFragmenter(20)); + public void testSpanMultiPhraseQueryHighlighting() throws Exception { + MultiPhraseQuery mpq = new MultiPhraseQuery(); - for (int i = 0; i < hits.length(); i++) - { - String text = hits.doc(i).get(FIELD_NAME); - TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + mpq.add(new Term[] { new Term(FIELD_NAME, "wordx"), new Term(FIELD_NAME, "wordb") 
}); + mpq.add(new Term(FIELD_NAME, "wordy")); - String stringResults[] = highlighter.getBestFragments(tokenStream,text,10); + doSearching(mpq); - tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); - TextFragment fragmentResults[] = highlighter.getBestTextFragments(tokenStream,text,true,10); + final int maxNumFragmentsRequired = 2; + assertExpectedHighlightCount(maxNumFragmentsRequired, 6); + } - assertTrue("Failed to find correct number of text Fragments: " + - fragmentResults.length + " vs "+ stringResults.length, fragmentResults.length==stringResults.length); - for (int j = 0; j < stringResults.length; j++) - { - System.out.println(fragmentResults[j]); - assertTrue("Failed to find same text Fragments: " + - fragmentResults[j] + " found", fragmentResults[j].toString().equals(stringResults[j])); - - } - - } - } + public void testSpanMultiPhraseQueryHighlightingWithGap() throws Exception { + MultiPhraseQuery mpq = new MultiPhraseQuery(); - public void testMaxSizeHighlight() throws Exception - { - doSearching("meat"); - Highlighter highlighter = - new Highlighter(this,new QueryScorer(query)); - highlighter.setMaxDocBytesToAnalyze(30); - TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0])); - highlighter.getBestFragment(tokenStream,texts[0]); - assertTrue("Setting MaxDocBytesToAnalyze should have prevented " + - "us from finding matches for this record: " + numHighlights + - " found", numHighlights == 0); - } - public void testMaxSizeHighlightTruncates() throws IOException - { - String goodWord="goodtoken"; - String stopWords[]={"stoppedtoken"}; - - TermQuery query= new TermQuery( new Term( "data", goodWord )); - SimpleHTMLFormatter fm=new SimpleHTMLFormatter(); - Highlighter hg = new Highlighter(fm, new QueryScorer( query )); - hg.setTextFragmenter( new NullFragmenter() ); + /* + * The toString of MultiPhraseQuery doesn't work so well with these + * out-of-order additions, but the Query itself seems to match accurately. + */ - String match = null; - StringBuffer sb=new StringBuffer(); - sb.append(goodWord); - for(int i=0;i<10000;i++) - { - sb.append(" "); - sb.append(stopWords[0]); - } - - hg.setMaxDocBytesToAnalyze(100); - match = hg.getBestFragment( new StandardAnalyzer(stopWords), "data", sb.toString()); - assertTrue("Matched text should be no more than 100 chars in length ", - match.length()4\" claims article"; - //run the highlighter on the raw content (scorer does not score any tokens for - // highlighting but scores a single fragment for selection - Highlighter highlighter = new Highlighter(this, - new SimpleHTMLEncoder(), new Scorer() - { - public void startFragment(TextFragment newFragment) - { - } - public float getTokenScore(Token token) - { - return 0; - } - public float getFragmentScore() - { - return 1; - } - }); - highlighter.setTextFragmenter(new SimpleFragmenter(2000)); - TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, - new StringReader(rawDocContent)); + public void run() throws Exception { + mode = SPAN; + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + } + }; - String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent,1,""); - //An ugly bit of XML creation: - String xhtml="\n"+ - "\n"+ - "\n"+ - "\n"+ - "My Test HTML Document\n"+ - "\n"+ - "\n"+ - "
<h2>"+encodedSnippet+"</h2>
\n"+ - "\n"+ - ""; - //now an ugly built of XML parsing to test the snippet is encoded OK - DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); - DocumentBuilder db = dbf.newDocumentBuilder(); - org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes())); - Element root=doc.getDocumentElement(); - NodeList nodes=root.getElementsByTagName("body"); - Element body=(Element) nodes.item(0); - nodes=body.getElementsByTagName("h2"); - Element h2=(Element) nodes.item(0); - String decodedSnippet=h2.getFirstChild().getNodeValue(); - assertEquals("XHTML Encoding should have worked:", rawDocContent,decodedSnippet); + helper.run(); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 7); + } + + public void testNotSpanSimpleQuery() throws Exception { + doSearching(new SpanNotQuery(new SpanNearQuery(new SpanQuery[] { + new SpanTermQuery(new Term(FIELD_NAME, "shot")), + new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) }, 3, false), new SpanTermQuery( + new Term(FIELD_NAME, "john")))); + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + mode = SPAN; + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + } + }; + + helper.run(); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 4); + } + + public void testGetBestFragmentsSimpleQuery() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + numHighlights = 0; + doSearching("Kennedy"); + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 4); + } + }; + + helper.start(); + } + + public void testGetFuzzyFragments() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + numHighlights = 0; + doSearching("Kinnedy~"); + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 5); + } + }; + + helper.start(); + } + + public void testGetWildCardFragments() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + numHighlights = 0; + doSearching("K?nnedy"); + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 4); + } + }; + + helper.start(); + } + + public void testGetMidWildCardFragments() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + numHighlights = 0; + doSearching("K*dy"); + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 5); + } + }; + + helper.start(); + } + + public void testGetRangeFragments() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + numHighlights = 0; + String queryString = FIELD_NAME + ":[kannedy TO kznnedy]"; + + // Need to explicitly set the QueryParser property to use RangeQuery + // rather + // than RangeFilters + QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer()); + 
parser.setUseOldRangeQuery(true); + query = parser.parse(queryString); + doSearching(query); + + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 5); + } + }; + + helper.start(); + } + + public void testGetConstantScoreRangeFragments() throws Exception { + + numHighlights = 0; + String queryString = FIELD_NAME + ":[kannedy TO kznnedy]"; + + // Need to explicitly set the QueryParser property to use RangeQuery + // rather + // than RangeFilters + QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer()); + // parser.setUseOldRangeQuery(true); + query = parser.parse(queryString); + + searcher = new IndexSearcher(ramDir); + // can't rewrite ConstantScoreRangeQuery if you want to highlight it - + // it rewrites to ConstantScoreQuery which cannot be highlighted + // query = unReWrittenQuery.rewrite(reader); + System.out.println("Searching for: " + query.toString(FIELD_NAME)); + hits = searcher.search(query); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(HighlighterTest.FIELD_NAME); + int maxNumFragmentsRequired = 2; + String fragmentSeparator = "..."; + SpanScorer scorer = null; + TokenStream tokenStream = null; + + tokenStream = new CachingTokenFilter(analyzer.tokenStream(HighlighterTest.FIELD_NAME, + new StringReader(text))); + scorer = new SpanScorer(query, HighlighterTest.FIELD_NAME, (CachingTokenFilter) tokenStream); + scorer.setHighlightCnstScrRngQuery(true); + + Highlighter highlighter = new Highlighter(this, scorer); + + ((CachingTokenFilter) tokenStream).reset(); + + highlighter.setTextFragmenter(new SimpleFragmenter(20)); + + String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, + fragmentSeparator); + System.out.println("\t" + result); } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 5); + } - public void testMultiSearcher() throws Exception - { - //setup index 1 - RAMDirectory ramDir1 = new RAMDirectory(); - IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true); - Document d = new Document(); - Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES, Field.Index.TOKENIZED); - d.add(f); - writer1.addDocument(d); - writer1.optimize(); - writer1.close(); - IndexReader reader1 = IndexReader.open(ramDir1); + public void testGetBestFragmentsPhrase() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { - //setup index 2 - RAMDirectory ramDir2 = new RAMDirectory(); - IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true); - d = new Document(); - f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES, Field.Index.TOKENIZED); - d.add(f); - writer2.addDocument(d); - writer2.optimize(); - writer2.close(); - IndexReader reader2 = IndexReader.open(ramDir2); + public void run() throws Exception { + numHighlights = 0; + doSearching("\"John Kennedy\""); + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + // Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 2); + } + }; + helper.start(); + } + public void testGetBestFragmentsSpan() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { - IndexSearcher searchers[]=new IndexSearcher[2]; - searchers[0] = new IndexSearcher(ramDir1); - searchers[1] = new 
IndexSearcher(ramDir2); - MultiSearcher multiSearcher=new MultiSearcher(searchers); - QueryParser parser=new QueryParser(FIELD_NAME, new StandardAnalyzer()); - query = parser.parse("multi*"); - System.out.println("Searching for: " + query.toString(FIELD_NAME)); - //at this point the multisearcher calls combine(query[]) - hits = multiSearcher.search(query); + public void run() throws Exception { + numHighlights = 0; + SpanQuery clauses[] = { new SpanTermQuery(new Term("contents", "john")), + new SpanTermQuery(new Term("contents", "kennedy")), }; - //query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer()); - Query expandedQueries[]=new Query[2]; - expandedQueries[0]=query.rewrite(reader1); - expandedQueries[1]=query.rewrite(reader2); - query=query.combine(expandedQueries); + SpanNearQuery snq = new SpanNearQuery(clauses, 1, true); + doSearching(snq); + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + // Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 2); + } + }; + helper.start(); + } - //create an instance of the highlighter with the tags used to surround highlighted text - Highlighter highlighter = - new Highlighter(this,new QueryScorer(query)); + public void testOffByOne() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { - for (int i = 0; i < hits.length(); i++) - { - String text = hits.doc(i).get(FIELD_NAME); - TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); - String highlightedText = highlighter.getBestFragment(tokenStream,text); - System.out.println(highlightedText); - } - assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); + public void run() throws Exception { + TermQuery query = new TermQuery(new Term("data", "help")); + Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query)); + hg.setTextFragmenter(new NullFragmenter()); - } - - public void testFieldSpecificHighlighting() throws IOException, ParseException - { - String docMainText="fred is one of the people"; - QueryParser parser=new QueryParser(FIELD_NAME,analyzer); - Query query=parser.parse("fred category:people"); - - //highlighting respects fieldnames used in query - QueryScorer fieldSpecificScorer=new QueryScorer(query, "contents"); - Highlighter fieldSpecificHighlighter = - new Highlighter(new SimpleHTMLFormatter(),fieldSpecificScorer); - fieldSpecificHighlighter.setTextFragmenter(new NullFragmenter()); - String result=fieldSpecificHighlighter.getBestFragment(analyzer,FIELD_NAME,docMainText); - assertEquals("Should match",result,"fred is one of the people"); - - //highlighting does not respect fieldnames used in query - QueryScorer fieldInSpecificScorer=new QueryScorer(query); - Highlighter fieldInSpecificHighlighter = - new Highlighter(new SimpleHTMLFormatter(),fieldInSpecificScorer); - fieldInSpecificHighlighter.setTextFragmenter(new NullFragmenter()); - result=fieldInSpecificHighlighter.getBestFragment(analyzer,FIELD_NAME,docMainText); - assertEquals("Should match",result,"fred is one of the people"); - - - reader.close(); - - } + String match = null; + match = hg.getBestFragment(new StandardAnalyzer(), "data", "help me [54-65]"); + assertEquals("help me [54-65]", match); + } + }; + + helper.start(); + } + + public void testGetBestFragmentsFilteredQuery() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + 
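+      // A hedged aside, not part of the original test: a FilteredQuery narrows
+      // the hits via its filter, but only the wrapped query contributes terms
+      // the scorer can highlight. A minimal sketch of the shape run() builds
+      // below (rf and snq as constructed there):
+      //
+      //   FilteredQuery fq = new FilteredQuery(snq, rf);
+      //   doSearching(fq); // only "john" and "kennedy" from snq are highlighted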
+ public void run() throws Exception { + numHighlights = 0; + RangeFilter rf = new RangeFilter("contents", "john", "john", true, true); + SpanQuery clauses[] = { new SpanTermQuery(new Term("contents", "john")), + new SpanTermQuery(new Term("contents", "kennedy")), }; + SpanNearQuery snq = new SpanNearQuery(clauses, 1, true); + FilteredQuery fq = new FilteredQuery(snq, rf); + + doSearching(fq); + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + // Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 2); + } + }; + + helper.start(); + } + + public void testGetBestFragmentsFilteredPhraseQuery() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + numHighlights = 0; + RangeFilter rf = new RangeFilter("contents", "john", "john", true, true); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term("contents", "john")); + pq.add(new Term("contents", "kennedy")); + FilteredQuery fq = new FilteredQuery(pq, rf); + + doSearching(fq); + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + // Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 2); + } + }; + + helper.start(); + } + + public void testGetBestFragmentsMultiTerm() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + numHighlights = 0; + doSearching("John Kenn*"); + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 5); + } + }; + + helper.start(); + } + + public void testGetBestFragmentsWithOr() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + numHighlights = 0; + doSearching("JFK OR Kennedy"); + doStandardHighlights(analyzer, hits, query, HighlighterTest.this); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 5); + } + }; + helper.start(); + } + + public void testGetBestSingleFragment() throws Exception { + + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + doSearching("Kennedy"); + numHighlights = 0; + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); + + Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, + HighlighterTest.this); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + String result = highlighter.getBestFragment(tokenStream, text); + System.out.println("\t" + result); + } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 4); + + numHighlights = 0; + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); + Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, + HighlighterTest.this); + highlighter.getBestFragment(analyzer, FIELD_NAME, text); + } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 4); + + numHighlights = 0; + for (int i = 0; i < 
hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); + Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, + HighlighterTest.this); + highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10); + } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 4); + + } + + }; + + helper.start(); + + } + + public void testGetBestSingleFragmentWithWeights() throws Exception { + + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + WeightedSpanTerm[] wTerms = new WeightedSpanTerm[2]; + wTerms[0] = new WeightedSpanTerm(10f, "hello"); + + List positionSpans = new ArrayList(); + positionSpans.add(new PositionSpan(0, 0)); + wTerms[0].addPositionSpans(positionSpans); + + wTerms[1] = new WeightedSpanTerm(1f, "kennedy"); + positionSpans = new ArrayList(); + positionSpans.add(new PositionSpan(14, 14)); + wTerms[1].addPositionSpans(positionSpans); + + Highlighter highlighter = getHighlighter(wTerms, HighlighterTest.this);// new + // Highlighter(new + // QueryScorer(wTerms)); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0])); + highlighter.setTextFragmenter(new SimpleFragmenter(2)); + + String result = highlighter.getBestFragment(tokenStream, texts[0]).trim(); + assertTrue("Failed to find best section using weighted terms. Found: [" + result + "]", + "Hello".equals(result)); + + // readjust weights + wTerms[1].setWeight(50f); + tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0])); + highlighter = getHighlighter(wTerms, HighlighterTest.this); + highlighter.setTextFragmenter(new SimpleFragmenter(2)); + + result = highlighter.getBestFragment(tokenStream, texts[0]).trim(); + assertTrue("Failed to find best section using weighted terms. Found: " + result, + "kennedy".equals(result)); + } + + }; + + helper.start(); + + } + + // tests a "complex" analyzer that produces multiple + // overlapping tokens + public void testOverlapAnalyzer() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + HashMap synonyms = new HashMap(); + synonyms.put("football", "soccer,footie"); + Analyzer analyzer = new SynonymAnalyzer(synonyms); + String srchkey = "football"; + + String s = "football-soccer in the euro 2004 footie competition"; + QueryParser parser = new QueryParser("bookid", analyzer); + Query query = parser.parse(srchkey); + + TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s)); + Highlighter highlighter = getHighlighter(query, null, tokenStream, HighlighterTest.this); + + // Get 3 best fragments and separate with a "..."
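+      // A hedged sketch, not from the original test, of the stream the synonym
+      // analyzer yields for "football" (see SynonymTokenizer at the end of this
+      // file): expansions reuse the original token's offsets and a zero position
+      // increment, which is what lets the highlighter merge the stack into a
+      // single highlighted region:
+      //
+      //   Token t = new Token("football", 0, 8);   // real token
+      //   Token syn = new Token("soccer", 0, 8);   // synonym expansion
+      //   syn.setPositionIncrement(0);             // stacked on the same position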
+ tokenStream = analyzer.tokenStream(null, new StringReader(s)); + String result = highlighter.getBestFragments(tokenStream, s, 3, "..."); + String expectedResult = "football-soccer in the euro 2004 footie competition"; + assertTrue("overlapping analyzer should handle highlights OK, expected:" + expectedResult + + " actual:" + result, expectedResult.equals(result)); + } + + }; + + helper.start(); + + } + + public void testGetSimpleHighlight() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + numHighlights = 0; + doSearching("Kennedy"); + // new Highlighter(HighlighterTest.this, new QueryScorer(query)); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); + Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, + HighlighterTest.this); + String result = highlighter.getBestFragment(tokenStream, text); + System.out.println("\t" + result); + } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 4); + } + }; + helper.start(); + } + + public void testGetTextFragments() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + + doSearching("Kennedy"); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); + + Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, + HighlighterTest.this);// new Highlighter(this, new + // QueryScorer(query)); + highlighter.setTextFragmenter(new SimpleFragmenter(20)); + String stringResults[] = highlighter.getBestFragments(tokenStream, text, 10); + + tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); + TextFragment fragmentResults[] = highlighter.getBestTextFragments(tokenStream, text, + true, 10); + + assertTrue("Failed to find correct number of text Fragments: " + fragmentResults.length + + " vs " + stringResults.length, fragmentResults.length == stringResults.length); + for (int j = 0; j < stringResults.length; j++) { + System.out.println(fragmentResults[j]); + assertTrue("Failed to find same text Fragments: " + fragmentResults[j] + " found", + fragmentResults[j].toString().equals(stringResults[j])); + + } + + } + } + }; + helper.start(); + } + + public void testMaxSizeHighlight() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + numHighlights = 0; + doSearching("meat"); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0])); + Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, + HighlighterTest.this);// new Highlighter(this, new + // QueryScorer(query)); + highlighter.setMaxDocBytesToAnalyze(30); + + highlighter.getBestFragment(tokenStream, texts[0]); + assertTrue("Setting MaxDocBytesToAnalyze should have prevented " + + "us from finding matches for this record: " + numHighlights + " found", + numHighlights == 0); + } + }; + + helper.start(); + } + + public void testMaxSizeHighlightTruncates() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + String goodWord = "goodtoken"; + String stopWords[] = { "stoppedtoken" }; + + TermQuery query = new TermQuery(new Term("data", goodWord)); + + 
String match = null; + StringBuffer sb = new StringBuffer(); + sb.append(goodWord); + for (int i = 0; i < 10000; i++) { + sb.append(" "); + sb.append(stopWords[0]); + } + SimpleHTMLFormatter fm = new SimpleHTMLFormatter(); + Highlighter hg = getHighlighter(query, "data", new StandardAnalyzer(stopWords).tokenStream( + "data", new StringReader(sb.toString())), fm);// new Highlighter(fm, + // new + // QueryScorer(query)); + hg.setTextFragmenter(new NullFragmenter()); + hg.setMaxDocBytesToAnalyze(100); + match = hg.getBestFragment(new StandardAnalyzer(stopWords), "data", sb.toString()); + assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg + .getMaxDocBytesToAnalyze()); + + // add another tokenized word to the overall length - but set way + // beyond + // the length of text under consideration (after a large slug of stop + // words + // + whitespace) + sb.append(" "); + sb.append(goodWord); + match = hg.getBestFragment(new StandardAnalyzer(stopWords), "data", sb.toString()); + assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg + .getMaxDocBytesToAnalyze()); + } + }; + + helper.start(); + + } + + public void testUnRewrittenQuery() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + numHighlights = 0; + // test to show that an unrewritten multi-term query yields no highlights + searcher = new IndexSearcher(ramDir); + Analyzer analyzer = new StandardAnalyzer(); + + QueryParser parser = new QueryParser(FIELD_NAME, analyzer); + Query query = parser.parse("JF? or Kenned*"); + System.out.println("Searching with primitive query"); + // forget to set this and... + // query=query.rewrite(reader); + Hits hits = searcher.search(query); + + // create an instance of the highlighter with the tags used to surround + // highlighted text + // QueryHighlightExtractor highlighter = new + // QueryHighlightExtractor(this, + // query, new StandardAnalyzer()); + + int maxNumFragmentsRequired = 3; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); + Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, + HighlighterTest.this); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + String highlightedText = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, "..."); + System.out.println(highlightedText); + } + // We expect to have zero highlights if the query is multi-term and is + // not + // rewritten! + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 0); + } + }; + + helper.start(); + } + + public void testNoFragments() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + doSearching("AnInvalidQueryWhichShouldYieldNoResults"); + + for (int i = 0; i < texts.length; i++) { + String text = texts[i]; + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); + Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, + HighlighterTest.this); + String result = highlighter.getBestFragment(tokenStream, text); + assertNull("The highlight result should be null for text with no query terms", result); + } + } + }; + + helper.start(); + } + + /** + * Demonstrates creation of an XHTML compliant doc using new encoding facilities.
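+ * 
+ * An illustrative aside, not part of the test itself: the SimpleHTMLEncoder
+ * used below escapes markup-significant characters such as &, < and > via
+ * Encoder.encodeText(), so the selected fragment can be embedded in XHTML
+ * and parsed back to the raw text, e.g. encodeText("Smith & sons") yields
+ * "Smith &amp; sons".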
+ * + * @throws Exception + */ + public void testEncoding() throws Exception { + + String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article"; + // run the highlighter on the raw content (scorer does not score any tokens + // for + // highlighting but scores a single fragment for selection + Highlighter highlighter = new Highlighter(this, new SimpleHTMLEncoder(), new Scorer() { + public void startFragment(TextFragment newFragment) { + } + + public float getTokenScore(Token token) { + return 0; + } + + public float getFragmentScore() { + return 1; + } + }); + highlighter.setTextFragmenter(new SimpleFragmenter(2000)); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(rawDocContent)); + + String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent, 1, ""); + // An ugly bit of XML creation: + String xhtml = "\n" + "\n" + + "\n" + + "\n" + "My Test HTML Document\n" + "\n" + "\n" + "

" + + encodedSnippet + "

\n" + "\n" + ""; + // now an ugly built of XML parsing to test the snippet is encoded OK + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); + DocumentBuilder db = dbf.newDocumentBuilder(); + org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes())); + Element root = doc.getDocumentElement(); + NodeList nodes = root.getElementsByTagName("body"); + Element body = (Element) nodes.item(0); + nodes = body.getElementsByTagName("h2"); + Element h2 = (Element) nodes.item(0); + String decodedSnippet = h2.getFirstChild().getNodeValue(); + assertEquals("XHTML Encoding should have worked:", rawDocContent, decodedSnippet); + } + + public void testMultiSearcher() throws Exception { + // setup index 1 + RAMDirectory ramDir1 = new RAMDirectory(); + IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true); + Document d = new Document(); + Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES, Field.Index.TOKENIZED); + d.add(f); + writer1.addDocument(d); + writer1.optimize(); + writer1.close(); + IndexReader reader1 = IndexReader.open(ramDir1); + + // setup index 2 + RAMDirectory ramDir2 = new RAMDirectory(); + IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true); + d = new Document(); + f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES, Field.Index.TOKENIZED); + d.add(f); + writer2.addDocument(d); + writer2.optimize(); + writer2.close(); + IndexReader reader2 = IndexReader.open(ramDir2); + + IndexSearcher searchers[] = new IndexSearcher[2]; + searchers[0] = new IndexSearcher(ramDir1); + searchers[1] = new IndexSearcher(ramDir2); + MultiSearcher multiSearcher = new MultiSearcher(searchers); + QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer()); + query = parser.parse("multi*"); + System.out.println("Searching for: " + query.toString(FIELD_NAME)); + // at this point the multisearcher calls combine(query[]) + hits = multiSearcher.search(query); + + // query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer()); + Query expandedQueries[] = new Query[2]; + expandedQueries[0] = query.rewrite(reader1); + expandedQueries[1] = query.rewrite(reader2); + query = query.combine(expandedQueries); + + // create an instance of the highlighter with the tags used to surround + // highlighted text + Highlighter highlighter = new Highlighter(this, new QueryScorer(query)); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); + String highlightedText = highlighter.getBestFragment(tokenStream, text); + System.out.println(highlightedText); + } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == 2); + + } + + public void testFieldSpecificHighlighting() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { + + public void run() throws Exception { + String docMainText = "fred is one of the people"; + QueryParser parser = new QueryParser(FIELD_NAME, analyzer); + Query query = parser.parse("fred category:people"); + + // highlighting respects fieldnames used in query + + Scorer fieldSpecificScorer = null; + if (mode == this.SPAN) { + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(docMainText)); + CachingTokenFilter ctf = new CachingTokenFilter(tokenStream); + fieldSpecificScorer = new SpanScorer(query, FIELD_NAME, ctf); + } else if (mode == this.STANDARD) { + fieldSpecificScorer = 
new QueryScorer(query, "contents"); + } + Highlighter fieldSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(), + fieldSpecificScorer); + fieldSpecificHighlighter.setTextFragmenter(new NullFragmenter()); + String result = fieldSpecificHighlighter.getBestFragment(analyzer, FIELD_NAME, docMainText); + assertEquals("Should match", result, "fred is one of the people"); + + // highlighting does not respect fieldnames used in query + Scorer fieldInSpecificScorer = null; + if (mode == this.SPAN) { + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(docMainText)); + CachingTokenFilter ctf = new CachingTokenFilter(tokenStream); + fieldInSpecificScorer = new SpanScorer(query, null, ctf); + } else if (mode == this.STANDARD) { + fieldInSpecificScorer = new QueryScorer(query); + } + + Highlighter fieldInSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(), + fieldInSpecificScorer); + fieldInSpecificHighlighter.setTextFragmenter(new NullFragmenter()); + result = fieldInSpecificHighlighter.getBestFragment(analyzer, FIELD_NAME, docMainText); + assertEquals("Should match", result, "fred is one of the people"); + + reader.close(); + } + }; + + helper.start(); + + } + protected TokenStream getTS2() { - //String s = "Hi-Speed10 foo"; + // String s = "Hi-Speed10 foo"; return new TokenStream() { Iterator iter; List lst; { lst = new ArrayList(); Token t; - t = new Token("hi",0,2); + t = new Token("hi", 0, 2); lst.add(t); - t = new Token("hispeed",0,8); + t = new Token("hispeed", 0, 8); lst.add(t); - t = new Token("speed",3,8); + t = new Token("speed", 3, 8); t.setPositionIncrement(0); lst.add(t); - t = new Token("10",8,10); + t = new Token("10", 8, 10); lst.add(t); - t = new Token("foo",11,14); + t = new Token("foo", 11, 14); lst.add(t); iter = lst.iterator(); } + public Token next() throws IOException { - return iter.hasNext() ? (Token)iter.next() : null; + return iter.hasNext() ? (Token) iter.next() : null; } }; } // same token-stream as above, but the bigger token comes first this time protected TokenStream getTS2a() { - //String s = "Hi-Speed10 foo"; + // String s = "Hi-Speed10 foo"; return new TokenStream() { Iterator iter; List lst; { lst = new ArrayList(); Token t; - t = new Token("hispeed",0,8); + t = new Token("hispeed", 0, 8); lst.add(t); - t = new Token("hi",0,2); + t = new Token("hi", 0, 2); t.setPositionIncrement(0); lst.add(t); - t = new Token("speed",3,8); + t = new Token("speed", 3, 8); lst.add(t); - t = new Token("10",8,10); + t = new Token("10", 8, 10); lst.add(t); - t = new Token("foo",11,14); + t = new Token("foo", 11, 14); lst.add(t); iter = lst.iterator(); } + public Token next() throws IOException { - return iter.hasNext() ? (Token)iter.next() : null; + return iter.hasNext() ? 
(Token) iter.next() : null; } }; } - public void testOverlapAnalyzer2() throws Exception - { + public void testOverlapAnalyzer2() throws Exception { + TestHighlightRunner helper = new TestHighlightRunner() { - String s = "Hi-Speed10 foo"; + public void run() throws Exception { + String s = "Hi-Speed10 foo"; - Query query; Highlighter highlighter; String result; + Query query; + Highlighter highlighter; + String result; - query = new QueryParser("text",new WhitespaceAnalyzer()).parse("foo"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("foo"); + highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); - query = new QueryParser("text",new WhitespaceAnalyzer()).parse("10"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("10"); + highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); - query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi"); + highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); - query = new QueryParser("text",new WhitespaceAnalyzer()).parse("speed"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("speed"); + highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); - query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hispeed"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hispeed"); + highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); - query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi speed"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi speed"); + highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); - /////////////////// same tests, just put the bigger overlapping token first - query = new QueryParser("text",new 
WhitespaceAnalyzer()).parse("foo"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + // ///////////////// same tests, just put the bigger overlapping token + // first + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("foo"); + highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); - query = new QueryParser("text",new WhitespaceAnalyzer()).parse("10"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("10"); + highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); - query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi"); + highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); - query = new QueryParser("text",new WhitespaceAnalyzer()).parse("speed"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("speed"); + highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); - query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hispeed"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hispeed"); + highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); - query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi speed"); - highlighter = new Highlighter(new QueryScorer(query)); - result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); - assertEquals("Hi-Speed10 foo",result); + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi speed"); + highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + } + }; + + helper.start(); } + /* + * + * public void testBigramAnalyzer() throws IOException, ParseException { + * //test to ensure analyzers with none-consecutive start/end offsets //dont + * double-highlight text //setup index 1 RAMDirectory ramDir = new + * RAMDirectory(); Analyzer bigramAnalyzer=new CJKAnalyzer(); IndexWriter + * writer = new IndexWriter(ramDir,bigramAnalyzer , true); Document d = new + * Document(); Field f = new Field(FIELD_NAME, "java abc def", true, true, + * true); 
d.add(f); writer.addDocument(d); writer.close(); IndexReader reader = + * IndexReader.open(ramDir); + * + * IndexSearcher searcher=new IndexSearcher(reader); query = + * QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer); + * System.out.println("Searching for: " + query.toString(FIELD_NAME)); hits = + * searcher.search(query); + * + * Highlighter highlighter = new Highlighter(this,new + * QueryFragmentScorer(query)); + * + * for (int i = 0; i < hits.length(); i++) { String text = + * hits.doc(i).get(FIELD_NAME); TokenStream + * tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text)); + * String highlightedText = highlighter.getBestFragment(tokenStream,text); + * System.out.println(highlightedText); } } + */ -/* + public String highlightTerm(String originalText, TokenGroup group) { + if (group.getTotalScore() <= 0) { + return originalText; + } + numHighlights++; // update stats used in assertions + return "" + originalText + ""; + } - public void testBigramAnalyzer() throws IOException, ParseException - { - //test to ensure analyzers with none-consecutive start/end offsets - //dont double-highlight text - //setup index 1 - RAMDirectory ramDir = new RAMDirectory(); - Analyzer bigramAnalyzer=new CJKAnalyzer(); - IndexWriter writer = new IndexWriter(ramDir,bigramAnalyzer , true); - Document d = new Document(); - Field f = new Field(FIELD_NAME, "java abc def", true, true, true); - d.add(f); - writer.addDocument(d); - writer.close(); - IndexReader reader = IndexReader.open(ramDir); + public void doSearching(String queryString) throws Exception { + QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer()); + query = parser.parse(queryString); + doSearching(query); + } - IndexSearcher searcher=new IndexSearcher(reader); - query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer); - System.out.println("Searching for: " + query.toString(FIELD_NAME)); - hits = searcher.search(query); + public void doSearching(Query unReWrittenQuery) throws Exception { + searcher = new IndexSearcher(ramDir); + // for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) + // you must use a rewritten query! 
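+    // An illustrative sketch, assuming this fixture: an unrewritten
+    // multi-term query such as the wildcard "Kenn*" holds no concrete terms
+    // for a scorer to match; rewrite() expands it against the reader into
+    // real index terms:
+    //
+    //   Query q = new QueryParser(FIELD_NAME, new StandardAnalyzer()).parse("Kenn*");
+    //   q = q.rewrite(reader);                      // e.g. now contains "kennedy"
+    //   new Highlighter(this, new QueryScorer(q));  // terms can now be scored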
+ query = unReWrittenQuery.rewrite(reader); + System.out.println("Searching for: " + query.toString(FIELD_NAME)); + hits = searcher.search(query); + } - Highlighter highlighter = - new Highlighter(this,new QueryFragmentScorer(query)); + public void assertExpectedHighlightCount(final int maxNumFragmentsRequired, + final int expectedHighlights) throws Exception { + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(FIELD_NAME, + new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.reset(); - for (int i = 0; i < hits.length(); i++) - { - String text = hits.doc(i).get(FIELD_NAME); - TokenStream tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text)); - String highlightedText = highlighter.getBestFragment(tokenStream,text); - System.out.println(highlightedText); - } + String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, + "..."); + System.out.println("\t" + result); - } -*/ + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", + numHighlights == expectedHighlights); + } + } + /* + * @see TestCase#setUp() + */ + protected void setUp() throws Exception { + ramDir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(), true); + for (int i = 0; i < texts.length; i++) { + addDoc(writer, texts[i]); + } - public String highlightTerm(String originalText , TokenGroup group) - { - if(group.getTotalScore()<=0) - { - return originalText; - } - numHighlights++; //update stats used in assertions - return "" + originalText + ""; - } - - public void doSearching(String queryString) throws Exception - { - QueryParser parser=new QueryParser(FIELD_NAME, new StandardAnalyzer()); - query = parser.parse(queryString); - doSearching(query); - } - public void doSearching(Query unReWrittenQuery) throws Exception - { - searcher = new IndexSearcher(ramDir); - //for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query! 
- query=unReWrittenQuery.rewrite(reader); - System.out.println("Searching for: " + query.toString(FIELD_NAME)); - hits = searcher.search(query); - } + writer.optimize(); + writer.close(); + reader = IndexReader.open(ramDir); + numHighlights = 0; + } - void doStandardHighlights() throws Exception - { - Highlighter highlighter =new Highlighter(this,new QueryScorer(query)); - highlighter.setTextFragmenter(new SimpleFragmenter(20)); - for (int i = 0; i < hits.length(); i++) - { - String text = hits.doc(i).get(FIELD_NAME); - int maxNumFragmentsRequired = 2; - String fragmentSeparator = "..."; - TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + private void addDoc(IndexWriter writer, String text) throws IOException { + Document d = new Document(); + Field f = new Field(FIELD_NAME, text, Field.Store.YES, Field.Index.TOKENIZED); + d.add(f); + writer.addDocument(d); - String result = - highlighter.getBestFragments( - tokenStream, - text, - maxNumFragmentsRequired, - fragmentSeparator); - System.out.println("\t" + result); - } - } + } - /* - * @see TestCase#setUp() - */ - protected void setUp() throws Exception - { - ramDir = new RAMDirectory(); - IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(), true); - for (int i = 0; i < texts.length; i++) - { - addDoc(writer, texts[i]); - } + /* + * @see TestCase#tearDown() + */ + protected void tearDown() throws Exception { + super.tearDown(); + } - writer.optimize(); - writer.close(); - reader = IndexReader.open(ramDir); - numHighlights = 0; - } - - private void addDoc(IndexWriter writer, String text) throws IOException - { - Document d = new Document(); - Field f = new Field(FIELD_NAME, text,Field.Store.YES, Field.Index.TOKENIZED); - d.add(f); - writer.addDocument(d); - - } - - /* - * @see TestCase#tearDown() - */ - protected void tearDown() throws Exception - { - super.tearDown(); - } - } - -//=================================================================== -//========== BEGIN TEST SUPPORTING CLASSES -//========== THESE LOOK LIKE, WITH SOME MORE EFFORT THESE COULD BE -//========== MADE MORE GENERALLY USEFUL. +// =================================================================== +// ========== BEGIN TEST SUPPORTING CLASSES +// ========== THESE LOOK LIKE, WITH SOME MORE EFFORT THESE COULD BE +// ========== MADE MORE GENERALLY USEFUL. // TODO - make synonyms all interchangeable with each other and produce // a version that does hyponyms - the "is a specialised type of ...." 
// so that car = audi, bmw and volkswagen but bmw != audi so different // behaviour to synonyms -//=================================================================== +// =================================================================== -class SynonymAnalyzer extends Analyzer -{ - private Map synonyms; +class SynonymAnalyzer extends Analyzer { + private Map synonyms; - public SynonymAnalyzer(Map synonyms) - { - this.synonyms = synonyms; - } + public SynonymAnalyzer(Map synonyms) { + this.synonyms = synonyms; + } - /* (non-Javadoc) - * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader) - */ - public TokenStream tokenStream(String arg0, Reader arg1) - { - return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms); - } + /* + * (non-Javadoc) + * + * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, + * java.io.Reader) + */ + public TokenStream tokenStream(String arg0, Reader arg1) { + return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms); + } } /** * Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer) + * * @author MAHarwood */ -class SynonymTokenizer extends TokenStream -{ - private TokenStream realStream; - private Token currentRealToken = null; - private Map synonyms; - StringTokenizer st = null; - public SynonymTokenizer(TokenStream realStream, Map synonyms) - { - this.realStream = realStream; - this.synonyms = synonyms; - } - public Token next() throws IOException - { - if (currentRealToken == null) - { - Token nextRealToken = realStream.next(); - if (nextRealToken == null) - { - return null; - } - String expansions = (String) synonyms.get(nextRealToken.termText()); - if (expansions == null) - { - return nextRealToken; - } - st = new StringTokenizer(expansions, ","); - if (st.hasMoreTokens()) - { - currentRealToken = nextRealToken; - } - return currentRealToken; - } - else - { - String nextExpandedValue = st.nextToken(); - Token expandedToken = - new Token( - nextExpandedValue, - currentRealToken.startOffset(), - currentRealToken.endOffset()); - expandedToken.setPositionIncrement(0); - if (!st.hasMoreTokens()) - { - currentRealToken = null; - st = null; - } - return expandedToken; - } - } +class SynonymTokenizer extends TokenStream { + private TokenStream realStream; + private Token currentRealToken = null; + private Map synonyms; + StringTokenizer st = null; -} + public SynonymTokenizer(TokenStream realStream, Map synonyms) { + this.realStream = realStream; + this.synonyms = synonyms; + } + public Token next() throws IOException { + if (currentRealToken == null) { + Token nextRealToken = realStream.next(); + if (nextRealToken == null) { + return null; + } + String expansions = (String) synonyms.get(nextRealToken.termText()); + if (expansions == null) { + return nextRealToken; + } + st = new StringTokenizer(expansions, ","); + if (st.hasMoreTokens()) { + currentRealToken = nextRealToken; + } + return currentRealToken; + } else { + String nextExpandedValue = st.nextToken(); + Token expandedToken = new Token(nextExpandedValue, currentRealToken.startOffset(), + currentRealToken.endOffset()); + expandedToken.setPositionIncrement(0); + if (!st.hasMoreTokens()) { + currentRealToken = null; + st = null; + } + return expandedToken; + } + } + static abstract class TestHighlightRunner { + static final int STANDARD = 0; + static final int SPAN = 1; + int mode = STANDARD; + + public Highlighter getHighlighter(Query query, String fieldName, TokenStream stream, + Formatter 
formatter) { + if (mode == STANDARD) { + return new Highlighter(formatter, new QueryScorer(query)); + } else if (mode == SPAN) { + CachingTokenFilter tokenStream = new CachingTokenFilter(stream); + Highlighter highlighter; + try { + highlighter = new Highlighter(formatter, new SpanScorer(query, fieldName, tokenStream)); + tokenStream.reset(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + return highlighter; + } else { + throw new RuntimeException("Unknown highlight mode"); + } + } + + Highlighter getHighlighter(WeightedTerm[] weightedTerms, Formatter formatter) { + if (mode == STANDARD) { + return new Highlighter(formatter, new QueryScorer(weightedTerms)); + } else if (mode == SPAN) { + Highlighter highlighter; + + highlighter = new Highlighter(formatter, new SpanScorer((WeightedSpanTerm[]) weightedTerms)); + + return highlighter; + } else { + throw new RuntimeException("Unknown highlight mode"); + } + } + + void doStandardHighlights(Analyzer analyzer, Hits hits, Query query, Formatter formatter) + throws Exception { + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(HighlighterTest.FIELD_NAME); + int maxNumFragmentsRequired = 2; + String fragmentSeparator = "..."; + Scorer scorer = null; + TokenStream tokenStream = null; + if (mode == SPAN) { + tokenStream = new CachingTokenFilter(analyzer.tokenStream(HighlighterTest.FIELD_NAME, + new StringReader(text))); + scorer = new SpanScorer(query, HighlighterTest.FIELD_NAME, + (CachingTokenFilter) tokenStream); + } else if (mode == STANDARD) { + scorer = new QueryScorer(query); + tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text)); + } + Highlighter highlighter = new Highlighter(formatter, scorer); + if (mode == SPAN) { + ((CachingTokenFilter) tokenStream).reset(); + } + highlighter.setTextFragmenter(new SimpleFragmenter(20)); + + String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, + fragmentSeparator); + System.out.println("\t" + result); + } + } + + abstract void run() throws Exception; + + void start() throws Exception { + System.out.println("Run standard"); + run(); + System.out.println("Run span"); + mode = SPAN; + run(); + } + } +}
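
A minimal usage sketch of the TestHighlightRunner pattern this patch introduces (illustrative only; it assumes the HighlighterTest fixture above): start() executes the same test body twice, first in the QueryScorer-backed STANDARD mode and then in the SpanScorer-backed SPAN mode, so every assertion is exercised against both scorer implementations.

    TestHighlightRunner helper = new TestHighlightRunner() {
      public void run() throws Exception {
        // mode is STANDARD on the first pass and SPAN on the second;
        // getHighlighter(...) picks QueryScorer or SpanScorer accordingly.
        numHighlights = 0;
        doSearching("Kennedy");
        doStandardHighlights(analyzer, hits, query, HighlighterTest.this);
      }
    };
    helper.start(); // prints "Run standard", runs, then "Run span" and runs again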