Index: contrib/highlighter/build.xml =================================================================== --- contrib/highlighter/build.xml (revision 586431) +++ contrib/highlighter/build.xml (working copy) @@ -1,27 +1,28 @@ [The XML markup of this build.xml hunk was lost in extraction; the surviving text shows the "Hits highlighter" project description and a new "Highlighter building dependency ${memory.jar}" entry, i.e. the hunk adds the contrib/memory jar to the highlighter's build dependencies.] Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (revision 0) @@ -0,0 +1,95 @@ +package org.apache.lucene.search.highlight; + + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.lucene.analysis.Token; + +import java.util.List; + + +/** + * {@link Fragmenter} implementation which breaks text up into same-size + * fragments but does not split up Spans. This is a simple sample class.
+ */ +public class SimpleSpanFragmenter implements Fragmenter { + private static final int DEFAULT_FRAGMENT_SIZE = 100; + private int fragmentSize; + private int currentNumFrags; + private int position = -1; + private SpanScorer spanScorer; + private int waitForPos = -1; + + /** + * @param spanscorer SpanScorer that was used to score hits + */ + public SimpleSpanFragmenter(SpanScorer spanscorer) { + this(spanscorer, DEFAULT_FRAGMENT_SIZE); + } + + /** + * @param spanscorer SpanScorer that was used to score hits + * @param fragmentSize size in characters of each fragment + */ + public SimpleSpanFragmenter(SpanScorer spanscorer, int fragmentSize) { + this.fragmentSize = fragmentSize; + this.spanScorer = spanscorer; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment(org.apache.lucene.analysis.Token) + */ + public boolean isNewFragment(Token token) { + position += token.getPositionIncrement(); + + if (waitForPos == position) { + waitForPos = -1; + } else if (waitForPos != -1) { + return false; + } + + WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.termText()); + + if (wSpanTerm != null) { + List positionSpans = wSpanTerm.getPositionSpans(); + + for (int i = 0; i < positionSpans.size(); i++) { + if (((PositionSpan) positionSpans.get(i)).start == position) { + waitForPos = ((PositionSpan) positionSpans.get(i)).end + 1; + + return true; + } + } + } + + boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags); + + if (isNewFrag) { + currentNumFrags++; + } + + return isNewFrag; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String) + */ + public void start(String originalText) { + position = 0; + currentNumFrags = 1; + } +} Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html (revision 0) @@ -0,0 +1,54 @@ + +

+The spanscorer classes give the Highlighter the ability +to highlight only those Tokens that contributed to a query match. +The SpanScorer class is the central component; it attempts to score Terms +based on whether they actually participated in scoring the Query. +

+

+The implementation is very similar to QueryScorer in that WeightedSpanTerms are extracted +from the given Query and then placed in a Map. During Token scoring, Terms found in +the Map return a score equal to their weight. The added wrinkle is that during extraction, +any position-sensitive sub-query of the Query is converted to a SpanQuery, and +SpanQuery.getSpans() is run against a MemoryIndex built from the TokenStream of the text +to be highlighted. The start and end positions of the +matching Spans are recorded with the respective WeightedSpanTerms, and these positions are +then used to filter possible Token matches during scoring. +

+
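For reference, the extraction step can also be driven directly. The following is a minimal sketch and not part of the patch; it assumes a rewritten query plus the analyzer, FIELD_NAME, and text variables used in the example below, and the looked-up term "kennedy" is only illustrative:

	WeightedSpanTermExtractor extractor = new WeightedSpanTermExtractor();
	CachingTokenFilter tokenStream = new CachingTokenFilter(
	                analyzer.tokenStream(FIELD_NAME, new StringReader(text)));
	// Map of term text -> WeightedSpanTerm; position spans are recorded for
	// any position-sensitive sub-queries
	Map terms = extractor.getWeightedSpanTerms(query, tokenStream, FIELD_NAME);
	WeightedSpanTerm term = (WeightedSpanTerm) terms.get("kennedy");
	if (term != null && term.isPositionSensitive()) {
		List positionSpans = term.getPositionSpans(); // start/end positions of the matching Spans
	}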

Example Usage

+ +
+	IndexSearcher searcher = new IndexSearcher(ramDir);
+	Query query = new QueryParser(FIELD_NAME, analyzer).parse("Kenne*");
+	query = query.rewrite(reader); //required to expand search terms
+	Hits hits = searcher.search(query);
+
+	for (int i = 0; i < hits.length(); i++)
+	{
+		String text = hits.doc(i).get(FIELD_NAME);
+		CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(
+                        FIELD_NAME, new StringReader(text)));
+        Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, tokenStream));
+        tokenStream.reset();
+        
+        // Get 3 best fragments and separate with a "..."
+		String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
+		System.out.println(result);
+	}
+
+ +

+If you make a call to any of the getBestFragments() methods more than once, you must call reset() on the SpanScorer +between each call. +

+ +
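Internally the SpanScorer tracks the current token position, so both the scorer and the (cached) token stream have to be rewound before the text is re-analyzed. A minimal sketch, not part of the patch, reusing the variables from the example above:

	SpanScorer scorer = new SpanScorer(query, FIELD_NAME, tokenStream);
	Highlighter highlighter = new Highlighter(scorer);
	tokenStream.reset();
	String first = highlighter.getBestFragments(tokenStream, text, 3, "...");

	tokenStream.reset();
	scorer.reset(); // required between calls to getBestFragments()
	String second = highlighter.getBestFragments(tokenStream, text, 3, "...");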

The SpanScorer class has a constructor which can use an IndexReader to derive the IDF (inverse document frequency) +for each term in order to influence the score. This is useful for extracting the most significant sections +of a document and for supplying the scores used by the GradientFormatter to color significant words more strongly. +The SpanScorer.getMaxTermWeight method is useful when passed to the GradientFormatter constructor to define the top score, +which is associated with the top color.
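A minimal sketch of that combination (not part of the patch; the colors are arbitrary placeholders, and reader is assumed to be an IndexReader open on the index being searched):

	CachingTokenFilter tokenStream = new CachingTokenFilter(
	                analyzer.tokenStream(FIELD_NAME, new StringReader(text)));
	SpanScorer scorer = new SpanScorer(query, FIELD_NAME, tokenStream, reader); // IDF-weighted terms
	GradientFormatter formatter = new GradientFormatter(scorer.getMaxTermWeight(),
	                null, null, "#FFFFFF", "#FF0000"); // white-to-red background scale
	Highlighter highlighter = new Highlighter(formatter, scorer);
	tokenStream.reset();
	String result = highlighter.getBestFragments(tokenStream, text, 3, "...");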

+ + + + Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (revision 0) @@ -0,0 +1,206 @@ +package org.apache.lucene.search.highlight; + +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Query; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + + +/** + * {@link Scorer} implementation which scores text fragments by the number of + * unique query terms found. This class converts appropriate Querys to + * SpanQuerys and attempts to score only those terms that participated in + * generating the 'hit' on the document. 
+ */ +public class SpanScorer implements Scorer { + private float totalScore; + private Set foundTerms; + private Map fieldWeightedSpanTerms; + private float maxTermWeight; + private int position = -1; + private String defaultField; + + /** + * @param query + * Query to use for highlighting + * @param field + * Field to highlight - pass null to ignore fields + * @param cachingTokenFilter + * TokenStream of source text to be highlighted + * @throws IOException + */ + public SpanScorer(Query query, String field, + CachingTokenFilter cachingTokenFilter) throws IOException { + init(query, field, cachingTokenFilter, null); + } + + /** + * @param defaultField - The default field for queries with the field name unspecified + */ + public SpanScorer(Query query, String field, + CachingTokenFilter cachingTokenFilter, String defaultField) throws IOException { + this.defaultField = defaultField.intern(); + init(query, field, cachingTokenFilter, null); + } + + /** + * @param query + * Query to use for highlighting + * @param field + * Field to highlight - pass null to ignore fields + * @param cachingTokenFilter + * TokenStream of source text to be highlighted + * @param reader + * @throws IOException + */ + public SpanScorer(Query query, String field, + CachingTokenFilter cachingTokenFilter, IndexReader reader) + throws IOException { + init(query, field, cachingTokenFilter, reader); + } + + /** + * @param weightedTerms + */ + public SpanScorer(WeightedSpanTerm[] weightedTerms) { + this.fieldWeightedSpanTerms = new HashMap(weightedTerms.length); + + for (int i = 0; i < weightedTerms.length; i++) { + WeightedSpanTerm existingTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get(weightedTerms[i].term); + + if ((existingTerm == null) || + (existingTerm.weight < weightedTerms[i].weight)) { + // if a term is defined more than once, always use the highest + // scoring weight + fieldWeightedSpanTerms.put(weightedTerms[i].term, weightedTerms[i]); + maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight()); + } + } + } + + /* + * (non-Javadoc) + * + * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore() + */ + public float getFragmentScore() { + return totalScore; + } + + /** + * + * @return The weight of the highest-weighted term (useful for passing to + * GradientFormatter to set the top end of the coloring scale). + */ + public float getMaxTermWeight() { + return maxTermWeight; + } + + /* + * (non-Javadoc) + * + * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token, + * int) + */ + public float getTokenScore(Token token) { + position += token.getPositionIncrement(); + + String termText = token.termText(); + + WeightedSpanTerm weightedSpanTerm; + + if ((weightedSpanTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get( + termText)) == null) { + return 0; + } + + if (weightedSpanTerm.positionSensitive && + !weightedSpanTerm.checkPosition(position)) { + return 0; + } + + float score = weightedSpanTerm.getWeight(); + + // found a query term - is it unique in this doc? + if (!foundTerms.contains(termText)) { + totalScore += score; + foundTerms.add(termText); + } + + return score; + } + + /** + * Retrieve the WeightedSpanTerm for the specified token. Useful for passing + * Span information to a Fragmenter.
+ * + * @param token + * @return WeightedSpanTerm for token + */ + public WeightedSpanTerm getWeightedSpanTerm(String token) { + return (WeightedSpanTerm) fieldWeightedSpanTerms.get(token); + } + + /** + * @param query + * @param field + * @param tokenStream + * @param reader + * @throws IOException + */ + private void init(Query query, String field, + CachingTokenFilter cachingTokenFilter, IndexReader reader) + throws IOException { + WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor() + : new WeightedSpanTermExtractor(defaultField); + + if (reader == null) { + this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query, + cachingTokenFilter, field); + } else { + this.fieldWeightedSpanTerms = qse.getWeightedSpanTermsWithScores(query, + cachingTokenFilter, field, reader); + } + } + + /** + * If you call Highlighter#getBestFragment() more than once you must reset + * the SpanScorer between each call. + */ + public void reset() { + position = -1; + } + + /* + * (non-Javadoc) + * + * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment) + */ + public void startFragment(TextFragment newFragment) { + foundTerms = new HashSet(); + totalScore = 0; + } +} Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java (revision 0) @@ -0,0 +1,138 @@ +package org.apache.lucene.search.highlight; + + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + + +/** + * Lightweight class to hold term, weight, and positions used for scoring this + * term. + */ +public class WeightedSpanTerm { + float weight; // multiplier + String term; // stemmed form + boolean positionSensitive; + private List positionSpans = new ArrayList(); + + /** + * @param weight + * @param term + */ + public WeightedSpanTerm(float weight, String term) { + this.weight = weight; + this.term = term; + this.positionSpans = new ArrayList(); + } + + /** + * @param weight + * @param term + * @param positionSensitive + */ + public WeightedSpanTerm(float weight, String term, boolean positionSensitive) { + this.weight = weight; + this.term = term; + this.positionSensitive = positionSensitive; + } + + /** + * Checks to see if this term is valid at position. 
+ * + * @param position + * to check against valid term positions + * @return true iff this term is a hit at this position + */ + public boolean checkPosition(int position) { + // There would probably be a slight speed improvement if PositionSpans + // were kept in some sort of priority queue - that way this method + // could bail early without checking each PositionSpan. + Iterator positionSpanIt = positionSpans.iterator(); + + while (positionSpanIt.hasNext()) { + PositionSpan posSpan = (PositionSpan) positionSpanIt.next(); + + if (((position >= posSpan.start) && (position <= posSpan.end))) { + return true; + } + } + + return false; + } + + /** + * @return the term value (stemmed) + */ + public String getTerm() { + return term; + } + + /** + * @return the weight associated with this term + */ + public float getWeight() { + return weight; + } + + /** + * @param term + * the term value (stemmed) + */ + public void setTerm(String term) { + this.term = term; + } + + /** + * @param weight + * the weight associated with this term + */ + public void setWeight(float weight) { + this.weight = weight; + } + + public void addPositionSpans(List positionSpans) { + this.positionSpans.addAll(positionSpans); + } + + public boolean isPositionSensitive() { + return positionSensitive; + } + + public void setPositionSensitive(boolean positionSensitive) { + this.positionSensitive = positionSensitive; + } + + public List getPositionSpans() { + return positionSpans; + } +} + + +// Utility class to store a Span +class PositionSpan { + int start; + int end; + + public PositionSpan(int start, int end) { + this.start = start; + this.end = end; + } +} Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 0) @@ -0,0 +1,355 @@ +package org.apache.lucene.search.highlight; + + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FilteredQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; + +import java.io.IOException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +/** + * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based + * on whether Terms from the query are contained in a supplied TokenStream. + */ +public class WeightedSpanTermExtractor { + + private String fieldName; + private CachingTokenFilter cachedTokenFilter; + private Map readers = new HashMap(10); + private String defaultField; + + public WeightedSpanTermExtractor () { + } + + public WeightedSpanTermExtractor (String defaultField) { + if (defaultField != null) { + this.defaultField = defaultField.intern(); + } + } + + private void closeReaders() { + Collection readerSet = readers.values(); + Iterator it = readerSet.iterator(); + + while (it.hasNext()) { + + IndexReader reader = (IndexReader) it.next(); + try { + reader.close(); + } catch (IOException e) { + // ignore - best-effort close of the per-field MemoryIndex readers + } + } + } + + /** + * Fills a Map with {@link WeightedSpanTerm}s using the terms + * from the supplied Query. + * + * @param query + * Query to extract Terms from + * @param terms + * Map to place created WeightedSpanTerms in + * @throws IOException + */ + private void extract(Query query, Map terms) throws IOException { + if (query instanceof BooleanQuery) { + BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses(); + Map booleanTerms = new HashMap(); + for (int i = 0; i < queryClauses.length; i++) { + if (!queryClauses[i].isProhibited()) { + extract(queryClauses[i].getQuery(), booleanTerms); + } + } + terms.putAll(booleanTerms); + } else if (query instanceof PhraseQuery) { + Term[] phraseQueryTerms = ((PhraseQuery) query).getTerms(); + SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length]; + for (int i = 0; i < phraseQueryTerms.length; i++) { + clauses[i] = new SpanTermQuery(phraseQueryTerms[i]); + } + + int slop = ((PhraseQuery) query).getSlop(); + boolean inorder = false; + + if (slop == 0) { + inorder = true; + } + + SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder); + sp.setBoost(query.getBoost()); + extractWeightedSpanTerms(terms, sp); + } else if (query instanceof TermQuery) { + extractWeightedTerms(terms, query); + } else if (query instanceof SpanQuery) { + extractWeightedSpanTerms(terms, (SpanQuery) query); + } else if (query instanceof FilteredQuery) { + extract(((FilteredQuery) query).getQuery(), terms); + } else { + // NO-OP + } + } + + /** + * Fills a Map with {@link WeightedSpanTerm}s using the terms + * from the supplied SpanQuery.
+ * + * @param terms + * Map to place created WeightedSpanTerms in + * @param spanQuery + * SpanQuery to extract Terms from + * @throws IOException + */ + private void extractWeightedSpanTerms(Map terms, SpanQuery spanQuery) + throws IOException { + Set nonWeightedTerms = new HashSet(); + spanQuery.extractTerms(nonWeightedTerms); + + Set fieldNames; + + if (fieldName == null) { + fieldNames = new HashSet(); + for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) { + Term queryTerm = (Term) iter.next(); + fieldNames.add(queryTerm.field()); + } + } else { + fieldNames = new HashSet(1); + fieldNames.add(fieldName); + } + // To support the use of the default field name + if (defaultField != null) { + fieldNames.add(defaultField); + } + + Iterator it = fieldNames.iterator(); + List spanPositions = new ArrayList(); + + while (it.hasNext()) { + String field = (String) it.next(); + + IndexReader reader = (IndexReader) readers.get(field); + + if (reader == null) { + MemoryIndex indexer = new MemoryIndex(); + indexer.addField(field, cachedTokenFilter); + IndexSearcher searcher = indexer.createSearcher(); + reader = searcher.getIndexReader(); + readers.put(field, reader); + } + + Spans spans = spanQuery.getSpans(reader); + + // collect span positions + while (spans.next()) { + spanPositions.add(new PositionSpan(spans.start(), spans.end() - 1)); + } + + cachedTokenFilter.reset(); + } + + if (spanPositions.size() == 0) { + // no spans found + return; + } + + for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) { + Term queryTerm = (Term) iter.next(); + + if (fieldNameComparator(queryTerm.field())) { + WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(queryTerm.text()); + + if (weightedSpanTerm == null) { + weightedSpanTerm = new WeightedSpanTerm(spanQuery.getBoost(), + queryTerm.text()); + weightedSpanTerm.addPositionSpans(spanPositions); + weightedSpanTerm.positionSensitive = true; + terms.put(queryTerm.text(), weightedSpanTerm); + } else { + if (spanPositions.size() > 0) { + weightedSpanTerm.addPositionSpans(spanPositions); + weightedSpanTerm.positionSensitive = true; + } + } + } + } + } + + /** + * Fills a Map with <@link WeightedSpanTerm>s using the terms + * from the supplied Query. + * + * @param terms + * Map to place created WeightedSpanTerms in + * @param query + * Query to extract Terms from + * @throws IOException + */ + private void extractWeightedTerms(Map terms, Query query) + throws IOException { + Set nonWeightedTerms = new HashSet(); + query.extractTerms(nonWeightedTerms); + + for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) { + Term queryTerm = (Term) iter.next(); + + if (fieldNameComparator(queryTerm.field())) { + WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.getBoost(), + queryTerm.text()); + terms.put(queryTerm.text(), weightedSpanTerm); + } + } + } + + /** + * Creates a Map of WeightedSpanTerms from the given + * Query and TokenStream. + * + *

+ * + * @param query + * that caused hit + * @param tokenStream + * of text to be highlighted + * @return + * @throws IOException + */ + public Map getWeightedSpanTerms(Query query, + CachingTokenFilter cachingTokenFilter) throws IOException { + this.fieldName = null; + this.cachedTokenFilter = cachingTokenFilter; + + Map terms = new HashMap(); + extract(query, terms); + closeReaders(); + + return terms; + } + + /** + * Creates a Map of WeightedSpanTerms from the given + * Query and TokenStream. + * + *

+ * + * @param query + * that caused hit + * @param tokenStream + * of text to be highlighted + * @param fieldName + * restricts Term's used based on field name + * @return + * @throws IOException + */ + public Map getWeightedSpanTerms(Query query, + CachingTokenFilter cachingTokenFilter, String fieldName) + throws IOException { + if (fieldName != null) { + this.fieldName = fieldName.intern(); + } + + Map terms = new HashMap(); + this.cachedTokenFilter = cachingTokenFilter; + + extract(query, terms); + closeReaders(); + + return terms; + } + + /** + * Creates a Map of WeightedSpanTerms from the given + * Query and TokenStream. Uses a supplied + * IndexReader to properly weight terms (for gradient + * highlighting). + * + *

+ * + * @param query + * that caused hit + * @param tokenStream + * of text to be highlighted + * @param fieldName + * restricts Term's used based on field name + * @param reader + * to use for scoring + * @return + * @throws IOException + */ + public Map getWeightedSpanTermsWithScores(Query query, + TokenStream tokenStream, String fieldName, IndexReader reader) + throws IOException { + this.fieldName = fieldName; + this.cachedTokenFilter = new CachingTokenFilter(tokenStream); + + Map terms = new HashMap(); + extract(query, terms); + + int totalNumDocs = reader.numDocs(); + Set weightedTerms = terms.keySet(); + Iterator it = weightedTerms.iterator(); + + while (it.hasNext()) { + WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(it.next()); + int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term)); + + // IDF algorithm taken from DefaultSimilarity class + float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + + 1)) + 1.0); + weightedSpanTerm.weight *= idf; + } + + closeReaders(); + + return terms; + } + + /** + * Necessary to implement matches for queries against defaultField + */ + private boolean fieldNameComparator(String fieldNameToCheck) { + boolean rv = fieldName == null || fieldNameToCheck == fieldName + || fieldNameToCheck == defaultField; + return rv; + } + +} + Index: contrib/highlighter/src/test/org/apache/lucene/search/highlight/SpanHighlighterTest.java =================================================================== --- contrib/highlighter/src/test/org/apache/lucene/search/highlight/SpanHighlighterTest.java (revision 0) +++ contrib/highlighter/src/test/org/apache/lucene/search/highlight/SpanHighlighterTest.java (revision 0) @@ -0,0 +1,1135 @@ +package org.apache.lucene.search.highlight; + + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.LowerCaseTokenizer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FilteredQuery; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RangeFilter; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanNotQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.store.RAMDirectory; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.StringTokenizer; + + +public class SpanHighlighterTest extends TestCase implements Formatter { + private static final String FIELD_NAME = "contents"; + private static final String DEFAULT_FIELD_NAME = "default_field"; + private IndexReader reader; + private Query query; + RAMDirectory ramDir; + public Searcher searcher = null; + public Hits hits = null; + int numHighlights = 0; + Analyzer analyzer = new StandardAnalyzer(); + String[] texts = { + "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot", + "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy", + "JFK has been shot", "John Kennedy has been shot", + "This text has a typo in referring to Keneddy", + "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", + "y z x y z a b" + }; + + + protected void setUp() throws Exception { + ramDir = new RAMDirectory(); + + IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(), true); + + for (int i = 0; i < texts.length; i++) { + addDoc(writer, texts[i]); + } + + writer.optimize(); + writer.close(); + reader = IndexReader.open(ramDir); + numHighlights = 0; + } + + protected void tearDown() throws Exception { + super.tearDown(); + } + + public void testHighlightingWithDefaultField() throws Exception { + + String s1 = "I call our world Flatland, not because we call it so,"; + + QueryParser parser = new QueryParser(DEFAULT_FIELD_NAME, new StandardAnalyzer()); + + // Verify that a query against the default field results in text being highlighted + // regardless of the field name. 
+ Query q = parser.parse("\"world Flatland\"~3"); + String expected = "I call our world Flatland, not because we call it so,"; + String observed = highlightField(q, "SOME_FIELD_NAME", s1); + System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed); + assertEquals("Query in the default field results in text for *ANY* field being highlighted", + expected, observed); + + // Verify that a query against a named field does not result in any highlighting + // when the query field name differs from the name of the field being highlighted, + // which in this example happens to be the default field name. + q = parser.parse("text:\"world Flatland\"~3"); + expected = s1; + observed = highlightField(q, DEFAULT_FIELD_NAME, s1); + System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed); + assertEquals("Query in a named field does not result in highlighting when that field isn't in the query", + s1, highlightField(q, DEFAULT_FIELD_NAME, s1)); + } + + /** + * This method intended for use with testHighlightingWithDefaultField() + */ + private static String highlightField (Query query, String fieldName, String text) + throws IOException { + CachingTokenFilter tokenStream = new CachingTokenFilter( + new StandardAnalyzer().tokenStream(fieldName, new StringReader(text))); + // Assuming "", "" used to highlight + SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(); + Highlighter highlighter = new Highlighter(formatter, + new SpanScorer(query, fieldName, tokenStream, DEFAULT_FIELD_NAME)); + highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE)); + tokenStream.reset(); + String rv = highlighter.getBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)"); + return rv.length() == 0 ? text : rv; + } + + public void testSimpleSpanHighlighter() throws Exception { + doSearching("Kennedy"); + + int maxNumFragmentsRequired = 2; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(new SpanScorer(query, + FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.reset(); + + String result = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, "..."); + System.out.println("\t" + result); + } + + // Not sure we can assert anything here - just running to check we dont throw any exceptions + } + + public void testSimpleSpanPhraseHighlighting() throws Exception { + doSearching("\"very long and contains\""); + + int maxNumFragmentsRequired = 2; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.reset(); + + String result = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, "..."); + System.out.println("\t" + result); + } + + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 3); + } + + public void testSimpleSpanPhraseHighlighting2() throws Exception { + doSearching("\"text piece long\"~5"); + + int maxNumFragmentsRequired = 2; + + for (int i = 0; i < hits.length(); i++) { + String text = 
hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.reset(); + + String result = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, "..."); + System.out.println("\t" + result); + } + + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 6); + } + + public void testSimpleSpanPhraseHighlighting3() throws Exception { + doSearching("\"x y z\""); + + int maxNumFragmentsRequired = 2; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.reset(); + + String result = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, "..."); + System.out.println("\t" + result); + + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 3); + } + } + + public void testGetBestFragmentsSimpleQuery() throws Exception { + doSearching("Kennedy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 4); + } + + public void testNearSpanSimpleQuery() throws Exception { + doSearching(new SpanNearQuery( + new SpanQuery[] { + new SpanTermQuery(new Term(FIELD_NAME, "beginning")), + new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) + }, 3, false)); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 2); + } + + public void testSpanHighlighting() throws Exception { + Query query1 = new SpanNearQuery(new SpanQuery[] { + new SpanTermQuery(new Term(FIELD_NAME, "wordx")), + new SpanTermQuery(new Term(FIELD_NAME, "wordy")) + }, 1, false); + Query query2 = new SpanNearQuery(new SpanQuery[] { + new SpanTermQuery(new Term(FIELD_NAME, "wordy")), + new SpanTermQuery(new Term(FIELD_NAME, "wordc")) + }, 1, false); + BooleanQuery bquery = new BooleanQuery(); + bquery.add(query1, Occur.SHOULD); + bquery.add(query2, Occur.SHOULD); + doSearching(bquery); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 7); + } + + public void testNotSpanSimpleQuery() throws Exception { + doSearching(new SpanNotQuery( + new SpanNearQuery( + new SpanQuery[] { + new SpanTermQuery(new Term(FIELD_NAME, "shot")), + new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) + }, 3, false), new SpanTermQuery(new Term(FIELD_NAME, "john")))); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 4); + } + + public void testGetFuzzyFragments() throws Exception { + doSearching("Kinnedy~"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 5); + } + + public void testGetWildCardFragments() throws Exception { + doSearching("K?nnedy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + + " 
found", numHighlights == 4); + } + + public void testGetMidWildCardFragments() throws Exception { + doSearching("K*dy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 5); + } + + public void testGetRangeFragments() throws Exception { + doSearching(FIELD_NAME + ":[kannedy TO kznnedy]"); //bug?needs lower case + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 5); + } + + public void testGetBestFragmentsPhrase() throws Exception { + doSearching("\"John Kennedy\""); + doStandardHighlights(); + //Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 2); + } + + public void testGetBestFragmentsSpan() throws Exception { + SpanQuery[] clauses = { + new SpanTermQuery(new Term("contents", "john")), + new SpanTermQuery(new Term("contents", "kennedy")), + }; + + SpanNearQuery snq = new SpanNearQuery(clauses, 1, true); + doSearching(snq); + doStandardHighlights(); + //Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 2); + } + + public void testOffByOne() throws IOException { + String text = "help me [54-65]"; + String field = "data"; + TermQuery query = new TermQuery(new Term(field, "help")); + TokenStream tokenStream = new StandardAnalyzer().tokenStream(field, + new StringReader(text)); + + Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), + new SpanScorer(query, field, new CachingTokenFilter(tokenStream))); + hg.setTextFragmenter(new NullFragmenter()); + + String match = null; + match = hg.getBestFragment(new StandardAnalyzer(), field, text); + assertEquals("help me [54-65]", match); + } + + public void testGetBestFragmentsFilteredQuery() throws Exception { + RangeFilter rf = new RangeFilter("contents", "john", "john", true, true); + SpanQuery[] clauses = { + new SpanTermQuery(new Term("contents", "john")), + new SpanTermQuery(new Term("contents", "kennedy")), + }; + SpanNearQuery snq = new SpanNearQuery(clauses, 1, true); + FilteredQuery fq = new FilteredQuery(snq, rf); + + doSearching(fq); + doStandardHighlights(); + //Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 2); + } + + public void testGetBestFragmentsFilteredPhraseQuery() + throws Exception { + RangeFilter rf = new RangeFilter("contents", "john", "john", true, true); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term("contents", "john")); + pq.add(new Term("contents", "kennedy")); + + FilteredQuery fq = new FilteredQuery(pq, rf); + + doSearching(fq); + doStandardHighlights(); + //Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 2); + } + + public void testGetBestFragmentsMultiTerm() throws Exception { + doSearching("John Kenn*"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 5); + } + + public void testGetBestFragmentsWithOr() throws Exception { + doSearching("JFK OR Kennedy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 5); + } + + public void 
testGetBestSingleFragment() throws Exception { + doSearching("Kennedy"); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.reset(); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + + String result = highlighter.getBestFragment(tokenStream, text); + System.out.println("\t" + result); + } + + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 4); + + numHighlights = 0; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.reset(); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + + highlighter.getBestFragment(analyzer, FIELD_NAME, text); + } + + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 4); + + numHighlights = 0; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.reset(); + highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10); + } + + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 4); + } + + public void testGetBestSingleFragmentWithWeights() throws Exception { + WeightedSpanTerm[] wTerms = new WeightedSpanTerm[2]; + wTerms[0] = new WeightedSpanTerm(10f, "hello"); + + List positionSpans = new ArrayList(); + positionSpans.add(new PositionSpan(0, 0)); + wTerms[0].addPositionSpans(positionSpans); + + wTerms[1] = new WeightedSpanTerm(1f, "kennedy"); + positionSpans = new ArrayList(); + positionSpans.add(new PositionSpan(14, 14)); + wTerms[1].addPositionSpans(positionSpans); + + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, + new StringReader(texts[0])); + Highlighter highlighter = new Highlighter(new SpanScorer(wTerms)); + + highlighter.setTextFragmenter(new SimpleFragmenter(2)); + + String result = highlighter.getBestFragment(tokenStream, texts[0]).trim(); + assertTrue("Failed to find best section using weighted terms. Found: [" + + result + "]", "Hello".equals(result)); + + //readjust weights + wTerms[1].setWeight(50f); + tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0])); + highlighter = new Highlighter(new SpanScorer(wTerms)); + highlighter.setTextFragmenter(new SimpleFragmenter(2)); + + result = highlighter.getBestFragment(tokenStream, texts[0]).trim(); + assertTrue("Failed to find best section using weighted terms. 
Found: " + + result, "kennedy".equals(result)); + } + + public void testGetBestSingleFragmentWithWeights2() throws Exception { + doSearching("refers kennedy"); + + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(texts[1]))); + Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME, + tokenStream, reader)); + tokenStream.reset(); + highlighter.setTextFragmenter(new SimpleFragmenter(2)); + + String result = highlighter.getBestFragment(tokenStream, texts[1]).trim(); + assertTrue("Failed to find best section using weighted terms. Found: [" + + result + "]", "refers".equals(result)); + } + + // tests a "complex" analyzer that produces multiple + // overlapping tokens + public void testOverlapAnalyzer() throws Exception { + HashMap synonyms = new HashMap(); + synonyms.put("football", "soccer,footie"); + + Analyzer analyzer = new SynonymAnalyzer(synonyms); + String srchkey = "football"; + + String s = "football-soccer in the euro 2004 footie competition"; + QueryParser parser = new QueryParser("bookid", analyzer); + Query query = parser.parse(srchkey); + + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + null, new StringReader(s))); + + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, "bookid", tokenStream)); + tokenStream.reset(); + + // Get 3 best fragments and seperate with a "..." + String result = highlighter.getBestFragments(tokenStream, s, 3, "..."); + String expectedResult = "football-soccer in the euro 2004 footie competition"; + assertEquals(expectedResult, result); + } + + public void testGetSimpleHighlight() throws Exception { + doSearching("Kennedy"); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.reset(); + + String result = highlighter.getBestFragment(tokenStream, text); + System.out.println("\t" + result); + } + + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 4); + } + + public void testGetTextFragments() throws Exception { + doSearching("Kennedy"); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + + SpanScorer spanScorer = new SpanScorer(query, FIELD_NAME, tokenStream); + Highlighter highlighter = new Highlighter(this, spanScorer); + highlighter.setTextFragmenter(new SimpleFragmenter(20)); + + tokenStream.reset(); + + String[] stringResults = highlighter.getBestFragments(tokenStream, text, + 10); + tokenStream.reset(); + spanScorer.reset(); + + //tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + TextFragment[] fragmentResults = highlighter.getBestTextFragments(tokenStream, + text, true, 10); + + assertTrue("Failed to find correct number of text Fragments: " + + fragmentResults.length + " vs " + stringResults.length, + fragmentResults.length == stringResults.length); + + for (int j = 0; j < stringResults.length; j++) { + System.out.println(fragmentResults[j]); + assertTrue("Failed to find same text Fragments: " + fragmentResults[j] + + " found", fragmentResults[j].toString().equals(stringResults[j])); + } + } + } + + public void 
testGetFragmentsSimpleSpanFragmenter() throws Exception { + doSearching(new SpanNearQuery( + new SpanQuery[] { + new SpanTermQuery(new Term(FIELD_NAME, "shot")), + new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) + }, 3, false)); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + + SpanScorer spanScorer = new SpanScorer(query, FIELD_NAME, tokenStream); + Highlighter highlighter = new Highlighter(this, spanScorer); + highlighter.setTextFragmenter(new SimpleSpanFragmenter(spanScorer, 1)); + + tokenStream.reset(); + + String[] stringResults = highlighter.getBestFragments(tokenStream, text, 1); + + if (i == 0) { + System.out.println(stringResults[0]); + + assertEquals("Kennedy has been shot", + stringResults[0].trim()); + } + + if (i == 1) { + System.out.println(stringResults[0]); + + assertEquals("kennedy has been shot", + stringResults[0].trim()); + } + } + + doSearching("\"piece of text that is very long\""); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + + SpanScorer spanScorer = new SpanScorer(query, FIELD_NAME, tokenStream); + Highlighter highlighter = new Highlighter(this, spanScorer); + highlighter.setTextFragmenter(new SimpleSpanFragmenter(spanScorer, 1)); + + tokenStream.reset(); + + String[] stringResults = highlighter.getBestFragments(tokenStream, text, 1); + + if (i == 0) { + System.out.println(stringResults[0]); + + assertEquals("this is a piece of text that is very long", + stringResults[0].trim()); + } + } + } + + public void testMaxSizeHighlight() throws Exception { + doSearching("meat"); + + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(texts[0]))); + + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setMaxDocBytesToAnalyze(30); + tokenStream.reset(); + highlighter.getBestFragment(tokenStream, texts[0]); + assertTrue("Setting MaxDocBytesToAnalyze should have prevented " + + "us from finding matches for this record: " + numHighlights + " found", + numHighlights == 0); + } + + public void testMaxSizeHighlightTruncates() throws IOException { + String goodWord = "goodtoken"; + String[] stopWords = { "stoppedtoken" }; + + TermQuery query = new TermQuery(new Term("data", goodWord)); + SimpleHTMLFormatter fm = new SimpleHTMLFormatter(); + + String field = "data"; + + String match = null; + StringBuffer sb = new StringBuffer(); + sb.append(goodWord); + + for (int i = 0; i < 10000; i++) { + sb.append(" "); + sb.append(stopWords[0]); + } + + TokenStream tokenStream = new StandardAnalyzer(stopWords).tokenStream(field, + new StringReader(sb.toString())); + CachingTokenFilter ctf = new CachingTokenFilter(tokenStream); + SpanScorer spanScorer = new SpanScorer(query, field, ctf); + Highlighter hg = new Highlighter(this, spanScorer); + hg.setTextFragmenter(new NullFragmenter()); + + hg.setMaxDocBytesToAnalyze(100); + match = hg.getBestFragment(new StandardAnalyzer(stopWords), field, + sb.toString()); + assertTrue("Matched text should be no more than 100 chars in length ", + match.length() < hg.getMaxDocBytesToAnalyze()); + + spanScorer.reset(); + //add another tokenized word to the overrall length - but set way beyond + //the length of 
text under consideration (after a large slug of stop words + whitespace) + sb.append(" "); + sb.append(goodWord); + match = hg.getBestFragment(new StandardAnalyzer(stopWords), "data", + sb.toString()); + assertTrue("Matched text should be no more than 100 chars in length ", + match.length() < hg.getMaxDocBytesToAnalyze()); + } + + public void testUnRewrittenQuery() throws IOException, ParseException { + //test to show how rewritten query can still be used + searcher = new IndexSearcher(ramDir); + + Analyzer analyzer = new StandardAnalyzer(); + + QueryParser parser = new QueryParser(FIELD_NAME, analyzer); + Query query = parser.parse("JF? or Kenned*"); + System.out.println("Searching with primitive query"); + + //forget to set this and... + //query=query.rewrite(reader); + Hits hits = searcher.search(query); + + //create an instance of the highlighter with the tags used to surround highlighted text + // QueryHighlightExtractor highlighter = new QueryHighlightExtractor(this, query, new StandardAnalyzer()); + int maxNumFragmentsRequired = 3; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.reset(); + + String highlightedText = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, "..."); + System.out.println(highlightedText); + } + + //We expect to have zero highlights if the query is multi-terms and is not rewritten! + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 0); + } + + public void testNoFragments() throws Exception { + doSearching("AnInvalidQueryWhichShouldYieldNoResults"); + + for (int i = 0; i < texts.length; i++) { + String text = texts[i]; + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.reset(); + + String result = highlighter.getBestFragment(tokenStream, text); + assertNull("The highlight result should be null for text with no query terms", + result); + } + } + + public void testMultiSearcher() throws Exception { + //setup index 1 + RAMDirectory ramDir1 = new RAMDirectory(); + IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true); + Document d = new Document(); + Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES, + Field.Index.TOKENIZED); + d.add(f); + writer1.addDocument(d); + writer1.optimize(); + writer1.close(); + + IndexReader reader1 = IndexReader.open(ramDir1); + + //setup index 2 + RAMDirectory ramDir2 = new RAMDirectory(); + IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true); + d = new Document(); + f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES, Field.Index.TOKENIZED); + d.add(f); + writer2.addDocument(d); + writer2.optimize(); + writer2.close(); + + IndexReader reader2 = IndexReader.open(ramDir2); + + IndexSearcher[] searchers = new IndexSearcher[2]; + searchers[0] = new IndexSearcher(ramDir1); + searchers[1] = new IndexSearcher(ramDir2); + + MultiSearcher multiSearcher = new MultiSearcher(searchers); + QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer()); + query = parser.parse("multi*"); + 
System.out.println("Searching for: " + query.toString(FIELD_NAME)); + //at this point the multisearcher calls combine(query[]) + hits = multiSearcher.search(query); + + //query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer()); + Query[] expandedQueries = new Query[2]; + expandedQueries[0] = query.rewrite(reader1); + expandedQueries[1] = query.rewrite(reader2); + query = query.combine(expandedQueries); + + //create an instance of the highlighter with the tags used to surround highlighted text + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.reset(); + + String highlightedText = highlighter.getBestFragment(tokenStream, text); + System.out.println(highlightedText); + } + + assertTrue("Failed to find correct number of highlights " + numHighlights + + " found", numHighlights == 2); + } + + public void testFieldSpecificHighlighting() + throws IOException, ParseException { + String docMainText = "fred is one of the people"; + QueryParser parser = new QueryParser(FIELD_NAME, analyzer); + Query query = parser.parse("fred category:people"); + + //highlighting respects fieldnames used in query + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, + new StringReader(docMainText)); + CachingTokenFilter ctf = new CachingTokenFilter(tokenStream); + SpanScorer fieldSpecificScorer = new SpanScorer(query, FIELD_NAME, ctf); + Highlighter fieldSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(), + fieldSpecificScorer); + fieldSpecificHighlighter.setTextFragmenter(new NullFragmenter()); + + String result = fieldSpecificHighlighter.getBestFragment(analyzer, + "contents", docMainText); + assertEquals("Should match", "fred is one of the people", result); + + //highlighting does not respect fieldnames used in query + tokenStream = analyzer.tokenStream(null, new StringReader(docMainText)); + ctf = new CachingTokenFilter(tokenStream); + + SpanScorer fieldInSpecificScorer = new SpanScorer(query, null, ctf); + Highlighter fieldInSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(), + fieldInSpecificScorer); + fieldInSpecificHighlighter.setTextFragmenter(new NullFragmenter()); + result = fieldInSpecificHighlighter.getBestFragment(analyzer, FIELD_NAME, + docMainText); + assertEquals("Should match", "fred is one of the people", + result); + + reader.close(); + } + + protected TokenStream getTS2() { + //String s = "Hi-Speed10 foo"; + return new TokenStream() { + Iterator iter; + List lst; + + { + lst = new ArrayList(); + + Token t; + t = new Token("hi", 0, 2); + lst.add(t); + t = new Token("hispeed", 0, 8); + lst.add(t); + t = new Token("speed", 3, 8); + t.setPositionIncrement(0); + lst.add(t); + t = new Token("10", 8, 10); + lst.add(t); + t = new Token("foo", 11, 14); + lst.add(t); + iter = lst.iterator(); + } + + public Token next() throws IOException { + return iter.hasNext() ? 
+      }
+    };
+  }
+
+  // same token-stream as above, but the bigger token comes first this time
+  protected TokenStream getTS2a() {
+    //String s = "Hi-Speed10 foo";
+    return new TokenStream() {
+      Iterator iter;
+      List lst;
+
+      {
+        lst = new ArrayList();
+
+        Token t;
+        t = new Token("hispeed", 0, 8);
+        lst.add(t);
+        t = new Token("hi", 0, 2);
+        t.setPositionIncrement(0);
+        lst.add(t);
+        t = new Token("speed", 3, 8);
+        lst.add(t);
+        t = new Token("10", 8, 10);
+        lst.add(t);
+        t = new Token("foo", 11, 14);
+        lst.add(t);
+        iter = lst.iterator();
+      }
+
+      public Token next() throws IOException {
+        return iter.hasNext() ? (Token) iter.next() : null;
+      }
+    };
+  }
+
+  public void testOverlapAnalyzer2() throws Exception {
+    String s = "Hi-Speed10 foo";
+
+    Query query;
+    Highlighter highlighter;
+    String result;
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("foo");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", new CachingTokenFilter(getTS2())));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("Hi-Speed10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("10");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", new CachingTokenFilter(getTS2())));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("Hi-Speed10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", new CachingTokenFilter(getTS2())));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("Hi-Speed10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("speed");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", new CachingTokenFilter(getTS2())));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("Hi-Speed10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hispeed");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", new CachingTokenFilter(getTS2())));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("Hi-Speed10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi speed");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", new CachingTokenFilter(getTS2())));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("Hi-Speed10 foo", result);
+
+    /////////////////// same tests, just put the bigger overlapping token first
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("foo");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", new CachingTokenFilter(getTS2a())));
+    result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+    assertEquals("Hi-Speed10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("10");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", new CachingTokenFilter(getTS2a())));
+    result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+    assertEquals("Hi-Speed10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", new CachingTokenFilter(getTS2a())));
+    result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+    assertEquals("Hi-Speed10 foo", result);
+
QueryParser("text", new WhitespaceAnalyzer()).parse("speed"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", new CachingTokenFilter(getTS2a()))); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hispeed"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", new CachingTokenFilter(getTS2a()))); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi speed"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", new CachingTokenFilter(getTS2a()))); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + } + + public void doSearching(String queryString) throws Exception { + QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer()); + parser.setUseOldRangeQuery(true); + query = parser.parse(queryString); + doSearching(query); + } + + public void doSearching(Query unReWrittenQuery) throws Exception { + searcher = new IndexSearcher(ramDir); + //for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query! + query = unReWrittenQuery.rewrite(reader); + System.out.println("Searching for: " + query.toString(FIELD_NAME)); + hits = searcher.search(query); + } + + void doStandardHighlights() throws Exception { + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + int maxNumFragmentsRequired = 2; + String fragmentSeparator = "..."; + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.reset(); + highlighter.setTextFragmenter(new SimpleFragmenter(20)); + + String result = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, fragmentSeparator); + System.out.println("\t" + result); + } + } + + void doStandardSpanHighlights() throws Exception { + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + int maxNumFragmentsRequired = 2; + String fragmentSeparator = "..."; + CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.reset(); + highlighter.setTextFragmenter(new SimpleFragmenter(20)); + + String result = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, fragmentSeparator); + System.out.println("\t" + result); + } + } + + private void addDoc(IndexWriter writer, String text) + throws IOException { + Document d = new Document(); + Field f = new Field(FIELD_NAME, text, Field.Store.YES, Field.Index.TOKENIZED); + d.add(f); + writer.addDocument(d); + } + + public String highlightTerm(String originalText, TokenGroup group) { + if (group.getTotalScore() <= 0) { + return originalText; + } + + numHighlights++; //update stats used in assertions + + return "" + originalText + ""; + } + + // =================================================================== + // ========== BEGIN TEST SUPPORTING CLASSES + // ========== THESE LOOK LIKE, WITH SOME MORE EFFORT THESE COULD BE + // ========== MADE MORE GENERALLY USEFUL. 
+  // TODO - make synonyms all interchangeable with each other and produce
+  // a version that does hyponyms - the "is a specialised type of ...."
+  // so that car = audi, bmw and volkswagen, but bmw != audi - so different
+  // behaviour to synonyms
+  // ===================================================================
+  class SynonymAnalyzer extends Analyzer {
+    private Map synonyms;
+
+    public SynonymAnalyzer(Map synonyms) {
+      this.synonyms = synonyms;
+    }
+
+    /* (non-Javadoc)
+     * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
+     */
+    public TokenStream tokenStream(String arg0, Reader arg1) {
+      return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms);
+    }
+  }
+
+  /**
+   * Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer)
+   * @author MAHarwood
+   */
+  class SynonymTokenizer extends TokenStream {
+    private TokenStream realStream;
+    private Token currentRealToken = null;
+    private Map synonyms;
+    StringTokenizer st = null;
+
+    public SynonymTokenizer(TokenStream realStream, Map synonyms) {
+      this.realStream = realStream;
+      this.synonyms = synonyms;
+    }
+
+    public Token next() throws IOException {
+      if (currentRealToken == null) {
+        Token nextRealToken = realStream.next();
+
+        if (nextRealToken == null) {
+          return null;
+        }
+
+        String expansions = (String) synonyms.get(nextRealToken.termText());
+
+        if (expansions == null) {
+          return nextRealToken;
+        }
+
+        st = new StringTokenizer(expansions, ",");
+
+        if (st.hasMoreTokens()) {
+          currentRealToken = nextRealToken;
+        }
+
+        //guard against an empty expansion list: emit the real token rather
+        //than ending the stream prematurely with a null
+        return (currentRealToken != null) ? currentRealToken : nextRealToken;
+      } else {
+        String nextExpandedValue = st.nextToken();
+        Token expandedToken = new Token(nextExpandedValue,
+            currentRealToken.startOffset(), currentRealToken.endOffset());
+        expandedToken.setPositionIncrement(0);
+
+        if (!st.hasMoreTokens()) {
+          currentRealToken = null;
+          st = null;
+        }
+
+        return expandedToken;
+      }
+    }
+  }
+}
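
Usage sketch (illustrative only, not part of the patch above; the map contents here are assumed sample values): SynonymTokenizer emits each real token followed by its comma-separated expansions at the same position (positionIncrement == 0), which is what lets SpanScorer treat a synonym as occupying the original term's position. A caller would wire it up roughly like this:

    Map synonyms = new HashMap();              // java.util.HashMap
    synonyms.put("football", "soccer,footie"); // assumed expansions for illustration
    Analyzer analyzer = new SynonymAnalyzer(synonyms);
    TokenStream ts = analyzer.tokenStream(FIELD_NAME, new StringReader("football"));
    // ts now yields "football", then "soccer" and "footie" at the same position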