Index: contrib/highlighter/build.xml =================================================================== --- contrib/highlighter/build.xml (revision 515134) +++ contrib/highlighter/build.xml (working copy) @@ -1,10 +1,29 @@ - + - Hits highlighter + Hits highlighter + + + + + + + + + + + + + + Highlighter building dependency ${memory.jar} + + + + + Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/CachedTokenStream.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/CachedTokenStream.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/CachedTokenStream.java (revision 0) @@ -0,0 +1,107 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +import java.io.IOException; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + + +/** + * Facilitates the reuse of a TokenStream. + * Records the underlying TokenStream as it is + * read and allows a rewind() or retrieval of the + * Tokens as a List. 
+ * + * @author Mark Miller + */ +public class CachedTokenStream extends TokenStream { + private List tokenList = new ArrayList(100); + private TokenStream tokenStream; + private boolean canRewind; + private Iterator it; + + /** + * + * @param tokenStream underlying TokenStream + */ + public CachedTokenStream(TokenStream tokenStream) { + this.tokenStream = tokenStream; + } + + /* (non-Javadoc) + * @see org.apache.lucene.analysis.TokenStream#close() + */ + public void close() throws IOException { + if (this.tokenStream != null) { + this.tokenStream.close(); + } + } + + /** + * @return + */ + public List getTokenList() { + if (!canRewind) { + throw new IllegalStateException( + "You must read the whole stream before retrieving the internal tokenList"); + } + + return tokenList; + } + + /* (non-Javadoc) + * @see org.apache.lucene.analysis.TokenStream#next() + */ + public Token next() throws IOException { + Token token = null; + + if (it == null) { + token = tokenStream.next(); + + if (token != null) { + tokenList.add(token); + } + } else { + if (it.hasNext()) { + token = (Token) it.next(); + } + } + + if (token == null) { + this.tokenStream.close(); + canRewind = true; + } + + return token; + } + + /** + * + */ + public void rewind() { + if (!canRewind) { + throw new IllegalStateException( + "You must read the whole stream before rewinding"); + } + + it = tokenList.iterator(); + } +} Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/QuerySpansExtractor.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/QuerySpansExtractor.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/QuerySpansExtractor.java (revision 0) @@ -0,0 +1,261 @@ +package org.apache.lucene.search.highlight; + +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this 
file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FilteredQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.Spans; + +import java.io.IOException; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +/** + * @author Mark Miller Feb 15, 2007 + * @author Mark Harwood + */ +public class QuerySpansExtractor { + private Map terms = new HashMap(); + private IndexReader reader; + + /** + * Creates WeightSpanQueries using the terms from + * query and the position information from spans. 
+ * + * @param query to extract terms from + * @param fieldName restricts terms used to this field + * @param terms map to place created WeightedSpanTerms in + * @param spans Spans for query + * @return + * @throws IOException + */ + private void addSpans(Query query, String fieldName, Map terms, Spans spans) + throws IOException { + List lowPositions = new ArrayList(); + List highPositions = new ArrayList(); + + // collect span positions + while (spans.next()) { + lowPositions.add(new Integer(spans.start())); + highPositions.add(new Integer(spans.end() - 1)); + } + + if (lowPositions.isEmpty()) { + // no spans found + return; + } + + HashSet nonWeightedTerms = new HashSet(); + query.extractTerms(nonWeightedTerms); + + for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) { + Term queryTerm = (Term) iter.next(); + + if ((fieldName == null) || (queryTerm.field() == fieldName)) { + WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.getBoost(), + queryTerm.text()); + weightedSpanTerm.setHighPos(highPositions); + weightedSpanTerm.setLowPos(lowPositions); + terms.put(queryTerm.text(), weightedSpanTerm); + } + } + } + + /** + * Creates WeightedSpanTerms using Spans retrieved from reader. + * Retrieve the WeightedSpanTerms with getTerms(). 
+ * + * @param query + * @param fieldName + * @param reader + * @throws IOException + */ + public void getWeightedSpanTerms(Query query, String fieldName, + IndexSearcher searcher) throws IOException { + this.reader = searcher.getIndexReader(); + getSpans(query, fieldName, terms); + } + + /** + * @param query + * @param fieldName + * @param reader + * @param indexReader + * @throws IOException + */ + public void getWeightedSpanTermsWithScores(Query query, String fieldName, + IndexSearcher searcher) throws IOException { + this.reader = searcher.getIndexReader(); + getSpans(query, fieldName, terms); + + int totalNumDocs = reader.numDocs(); + Set weightedTerms = terms.keySet(); + Iterator it = weightedTerms.iterator(); + + while (it.hasNext()) { + WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(it.next()); + int docFreq = reader.docFreq(new Term(fieldName, + weightedSpanTerm.term)); + + // IDF algorithm taken from DefaultSimilarity class + float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + + 1)) + 1.0); + weightedSpanTerm.weight *= idf; + } + } + + /** + * @param query + * @param fieldName + * @param reader + * @param terms + * @return + * @throws IOException + */ + private void getSpans(Query query, String fieldName, Map terms) + throws IOException { + if (query instanceof BooleanQuery) { + getSpansFromBooleanQuery((BooleanQuery) query, fieldName, terms); + } else if (query instanceof PhraseQuery) { + getSpansFromPhraseQuery((PhraseQuery) query, fieldName, terms); + } else if (query instanceof TermQuery) { + getSpansFromTermQuery((TermQuery) query, fieldName, terms); + } else if (query instanceof SpanQuery) { + getSpansFromSpanQuery((SpanQuery) query, fieldName, terms); + } else if (query instanceof FilteredQuery) { + getTermsFromFilteredQuery((FilteredQuery) query, fieldName, terms); + } else { + } + } + + /** + * @param query + * @param fieldName + * @param reader + * @param terms + * @return + * @throws IOException + */ + 
private void getSpansFromBooleanQuery(BooleanQuery query, String fieldName, + Map terms) throws IOException { + BooleanClause[] queryClauses = query.getClauses(); + int i; + + Map possibleTerms = new HashMap(); + + for (i = 0; i < queryClauses.length; i++) { + System.out.println("checking clause:" + queryClauses[i]); + + if (!queryClauses[i].isProhibited()) { + getSpans(queryClauses[i].getQuery(), fieldName, possibleTerms); + } + } + + terms.putAll(possibleTerms); + } + + /** + * @param query + * @param fieldName + * @param reader + * @param terms + * @return + * @throws IOException + */ + private void getSpansFromPhraseQuery(PhraseQuery query, String fieldName, + Map terms) throws IOException { + Term[] queryTerms = query.getTerms(); + int i; + SpanQuery[] clauses = new SpanQuery[queryTerms.length]; + + for (i = 0; i < queryTerms.length; i++) { + clauses[i] = new SpanTermQuery(queryTerms[i]); + } + + SpanNearQuery sp = new SpanNearQuery(clauses, query.getSlop(), false); + + addSpans(query, fieldName, terms, sp.getSpans(reader)); + } + + /** + * @param query + * @param fieldName + * @param reader + * @param terms + * @return + * @throws IOException + */ + private void getSpansFromSpanQuery(SpanQuery query, String fieldName, + Map terms) throws IOException { + addSpans(query, fieldName, terms, query.getSpans(reader)); + } + + /** + * @param query + * @param fieldName + * @param reader + * @param terms + * @return + * @throws IOException + */ + private void getSpansFromTermQuery(TermQuery query, String fieldName, + Map terms) throws IOException { + Term term = query.getTerm(); + + SpanTermQuery stq = new SpanTermQuery(term); + + Spans spans = stq.getSpans(reader); + + addSpans(query, fieldName, terms, spans); + } + + /** + * @return WeightSpanTerms + */ + public Map getTerms() { + return terms; + } + + /** + * @param query + * @param fieldName + * @param reader + * @param terms + * @return + * @throws IOException + */ + private void 
getTermsFromFilteredQuery(FilteredQuery query, + String fieldName, Map terms) throws IOException { + getSpans(query.getQuery(), fieldName, terms); + } +} Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (revision 0) @@ -0,0 +1,85 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.lucene.analysis.Token; + +import java.util.List; + + +/** + * {@link Fragmenter} implementation which breaks text up into same-size + * fragments but does not split up Spans. 
+ * + * @author Mark Miller + * + */ +public class SimpleSpanFragmenter implements Fragmenter { + private static final int DEFAULT_FRAGMENT_SIZE = 100; + private int fragmentSize; + private int currentNumFrags; + private int position = -1; + private SpanScorer spanScorer; + private int waitForPos = -1; + + public SimpleSpanFragmenter(SpanScorer spanscorer) { + this(spanscorer, DEFAULT_FRAGMENT_SIZE); + } + + /** + * @param fragmentSize size in bytes of each fragment + */ + public SimpleSpanFragmenter(SpanScorer spanscorer, int fragmentSize) { + this.fragmentSize = fragmentSize; + this.spanScorer = spanscorer; + } + + public boolean isNewFragment(Token token) { + position += token.getPositionIncrement(); + + if (waitForPos == position) { + waitForPos = -1; + } else if (waitForPos != -1) { + return false; + } + + WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.termText()); + + if (wSpanTerm != null) { + List lowPos = wSpanTerm.getLowPos(); + + for (int i = 0; i < lowPos.size(); i++) { + if (((Integer) lowPos.get(i)).intValue() == position) { + waitForPos = ((Integer) wSpanTerm.getHighPos().get(i)).intValue(); + + return true; + } + } + } + + boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags); + + if (isNewFrag) { + currentNumFrags++; + } + + return isNewFrag; + } + + public void start(String originalText) { + position = 0; + currentNumFrags = 1; + } +} Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html (revision 0) @@ -0,0 +1,58 @@ + + +

+The spanscorer package contains classes to provide the Highlighter with the ability +to highlight the Tokens that contributed to a search 'hit'. +The SpanScorer class is the central component and it will attempt to score Terms +based on whether they actually participated in scoring the Query. +

+

+The implementation is very similar to QueryScorer in that WeightedSpanTerms are extracted +from the given Query and then placed in a Map. During Token scoring, Terms found in +the Map return a score equal to their weight. The added wrinkle is that when terms are +extracted, the sub Queries that make up the Query are converted to SpanQuerys and +SpanQuery.getSpans() is applied to a MemoryIndex containing the TokenStream of the text to +be highlighted. The start and end positions of the matching Spans are recorded with the +respective WeightedSpanTerms and these positions are then used to filter possible Token +matches during scoring. This method of 'real' hit highlighting may not be 100% perfect, but +the results are very accurate and very likely acceptable. +

+

Example Usage

+ +
+	IndexSearcher searcher = new IndexSearcher(ramDir);
+	Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
+	query = query.rewrite(reader); //required to expand search terms
+	Hits hits = searcher.search(query);
+
+	for (int i = 0; i < hits.length(); i++)
+	{
+		String text = hits.doc(i).get(FIELD_NAME);
+		CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+                        FIELD_NAME, new StringReader(text)));
+        Highlighter highlighter = new Highlighter(this,
+                    new SpanScorer(query, FIELD_NAME, tokenStream));
+        tokenStream.rewind();
+        
+        // Get 3 best fragments and separate with a "..."
+		String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
+		System.out.println(result);
+	}
+
+ +

+If you make a call to getBestFragments() more than once, then you must call reset() on the SpanScorer +between each call. +

+ +

The SpanScorer class has a constructor which can use an IndexReader to derive the IDF (inverse document frequency) +for each term in order to influence the score. This is useful for helping to extract the most significant sections +of a document and in supplying scores used by the GradientFormatter to color significant words more strongly. +The SpanScorer.getMaxWeight method is useful when passed to the GradientFormatter constructor to define the top score +which is associated with the top color.

+ + + + + + Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (revision 0) @@ -0,0 +1,196 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; + +import java.io.IOException; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + + +/** + * {@link Scorer} implementation which scores text fragments by the number of unique query terms found. + * This class converts all Querys to SpanQuerys and attempts to score only those terms that participated + * in generating the 'hit' on the document. 
+ * + * @author Mark Miller + */ +public class SpanScorer implements Scorer { + private float totalScore; + private Set foundTerms; + private Map fieldWeightedSpanTerms; + private float maxTermWeight; + private int position = -1; + + /** + * @param query + * @param field + * @param tokenStream + * @throws IOException + */ + public SpanScorer(Query query, String field, TokenStream tokenStream) + throws IOException { + init(query, field, tokenStream, null); + } + + /** + * @param query + * @param field + * @param tokenStream + * @param reader + * @throws IOException + */ + public SpanScorer(Query query, String field, TokenStream tokenStream, + IndexReader reader) throws IOException { + init(query, field, tokenStream, reader); + } + + /** + * @param weightedTerms + */ + public SpanScorer(WeightedSpanTerm[] weightedTerms) { + this.fieldWeightedSpanTerms = new HashMap(weightedTerms.length); + + for (int i = 0; i < weightedTerms.length; i++) { + WeightedSpanTerm existingTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get(weightedTerms[i].term); + + if ((existingTerm == null) || + (existingTerm.weight < weightedTerms[i].weight)) { + // if a term is defined more than once, always use the highest + // scoring weight + fieldWeightedSpanTerms.put(weightedTerms[i].term, + weightedTerms[i]); + maxTermWeight = Math.max(maxTermWeight, + weightedTerms[i].getWeight()); + } + } + } + + /* + * (non-Javadoc) + * + * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore() + */ + public float getFragmentScore() { + return totalScore; + } + + /** + * + * @return The highest weighted term (useful for passing to + * GradientFormatter to set top end of coloring scale. 
+ */ + public float getMaxTermWeight() { + return maxTermWeight; + } + + /* + * (non-Javadoc) + * + * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token, + * int) + */ + public float getTokenScore(Token token) { + position += token.getPositionIncrement(); + + String termText = token.termText(); + + WeightedSpanTerm weightedSpanTerm; + + if ((weightedSpanTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get( + termText)) == null) { + return 0; + } + + if (!weightedSpanTerm.checkPosition(position)) { + return 0; + } + + float score = weightedSpanTerm.getWeight(); + + // found a query term - is it unique in this doc? + if (!foundTerms.contains(termText)) { + totalScore += score; + foundTerms.add(termText); + } + + return score; + } + + /** + * Retrieve the WeightedSpanTerm for the specified token. + * Useful for passing Span information to a Fragmenter. + * + * @param token + * @return WeightedSpanTerm for token + */ + public WeightedSpanTerm getWeightedSpanTerm(String token) { + return (WeightedSpanTerm) fieldWeightedSpanTerms.get(token); + } + + /** + * @param query + * @param field + * @param tokenStream + * @param reader + * @throws IOException + */ + private void init(Query query, String field, TokenStream tokenStream, + IndexReader reader) throws IOException { + CachedTokenStream cachedTokenStream = new CachedTokenStream(tokenStream); + MemoryIndex indexer = new MemoryIndex(); + + indexer.addField(field, cachedTokenStream); + + IndexSearcher searcher = indexer.createSearcher(); + + QuerySpansExtractor qse = new QuerySpansExtractor(); + + if (reader == null) { + qse.getWeightedSpanTerms(query, field, searcher); + } else { + qse.getWeightedSpanTermsWithScores(query, field, searcher); + } + + this.fieldWeightedSpanTerms = qse.getTerms(); + } + + /** + * If you call Highlighter#getBestFragment() more than once you must reset + * the SpanScorer between each call. 
+ */ + public void reset() { + position = -1; + } + + /* + * (non-Javadoc) + * + * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment) + */ + public void startFragment(TextFragment newFragment) { + foundTerms = new HashSet(); + totalScore = 0; + } +} Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java =================================================================== --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java (revision 0) +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java (revision 0) @@ -0,0 +1,118 @@ +package org.apache.lucene.search.highlight; + +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.util.Iterator; +import java.util.List; + + +/** + * Lightweight class to hold term, weight, and positions used for scoring this term + * + * @author Mark Miller + */ +public class WeightedSpanTerm { + float weight; // multiplier + String term; //stemmed form + private List lowPos; + private List highPos; + + /** + * @param weight + * @param term + */ + public WeightedSpanTerm(float weight, String term) { + this.weight = weight; + this.term = term; + } + + /** + * @param position + * @return + */ + public boolean checkPosition(int position) { + Iterator hit = highPos.iterator(); + Iterator lit = lowPos.iterator(); + + while (hit.hasNext()) { + Integer highInteger = (Integer) hit.next(); + Integer lowInteger = (Integer) lit.next(); + + if (((position >= lowInteger.intValue()) && + (position <= highInteger.intValue()))) { + return true; + } + } + + return false; + } + + /** + * @return the term value (stemmed) + */ + public String getTerm() { + return term; + } + + /** + * @return the weight associated with this term + */ + public float getWeight() { + return weight; + } + + /** + * @param term the term value (stemmed) + */ + public void setTerm(String term) { + this.term = term; + } + + /** + * @param weight the weight associated with this term + */ + public void setWeight(float weight) { + this.weight = weight; + } + + /** + * @return + */ + public List getHighPos() { + return highPos; + } + + /** + * @param highPos + */ + public void setHighPos(List highPos) { + this.highPos = highPos; + } + + /** + * @return + */ + public List getLowPos() { + return lowPos; + } + + /** + * @param lowPos + */ + public void setLowPos(List lowPos) { + this.lowPos = lowPos; + } +} Index: contrib/highlighter/src/test/org/apache/lucene/search/highlight/SpanHighlighterTest.java =================================================================== --- contrib/highlighter/src/test/org/apache/lucene/search/highlight/SpanHighlighterTest.java (revision 0) +++ 
contrib/highlighter/src/test/org/apache/lucene/search/highlight/SpanHighlighterTest.java (revision 0) @@ -0,0 +1,1042 @@ +package org.apache.lucene.search.highlight; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseTokenizer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.FilteredQuery; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.RangeFilter; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanNotQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.store.RAMDirectory; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.StringTokenizer; + + +public class SpanHighlighterTest extends TestCase implements Formatter { + private static final String FIELD_NAME = "contents"; + private IndexReader reader; + private Query query; + RAMDirectory ramDir; + public Searcher searcher = null; + public 
Hits hits = null; + int numHighlights = 0; + Analyzer analyzer = new StandardAnalyzer(); + String[] texts = { + "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot", + "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy", + "JFK has been shot", "John Kennedy has been shot", + "This text has a typo in referring to Keneddy" + }; + + protected void setUp() throws Exception { + ramDir = new RAMDirectory(); + + IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(), + true); + + for (int i = 0; i < texts.length; i++) { + addDoc(writer, texts[i]); + } + + writer.optimize(); + writer.close(); + reader = IndexReader.open(ramDir); + numHighlights = 0; + } + + protected void tearDown() throws Exception { + super.tearDown(); + } + + public void testSimpleSpanHighlighter() throws Exception { + doSearching("Kennedy"); + + int maxNumFragmentsRequired = 2; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(new SpanScorer(query, + FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.rewind(); + + String result = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, "..."); + System.out.println("\t" + result); + } + + //Not sure we can assert anything here - just running to check we dont throw any exceptions + } + + public void testSimpleSpanPhraseHighlighting() throws Exception { + doSearching("\"very long and contains\""); + + int maxNumFragmentsRequired = 2; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachedTokenStream tokenStream = new 
CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(new SpanScorer(query, + FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.rewind(); + + String result = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, "..."); + System.out.println("\t" + result); + } + + //Not sure we can assert anything here - just running to check we dont throw any exceptions + } + + public void testGetBestFragmentsSimpleQuery() throws Exception { + doSearching("Kennedy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 4); + } + + public void testNearSpanSimpleQuery() throws Exception { + doSearching(new SpanNearQuery( + new SpanQuery[] { + new SpanTermQuery(new Term(FIELD_NAME, "beginning")), + new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) + }, 3, false)); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 2); + } + + public void testNotSpanSimpleQuery() throws Exception { + doSearching(new SpanNotQuery( + new SpanNearQuery( + new SpanQuery[] { + new SpanTermQuery(new Term(FIELD_NAME, "shot")), + new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) + }, 3, false), + new SpanTermQuery(new Term(FIELD_NAME, "john")))); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 4); + } + + public void testGetFuzzyFragments() throws Exception { + doSearching("Kinnedy~"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 5); + } + + public void testGetWildCardFragments() throws Exception { + doSearching("K?nnedy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", 
numHighlights == 4); + } + + public void testGetMidWildCardFragments() throws Exception { + doSearching("K*dy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 5); + } + + public void testGetRangeFragments() throws Exception { + doSearching(FIELD_NAME + ":[kannedy TO kznnedy]"); //bug?needs lower case + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 5); + } + + public void testGetBestFragmentsPhrase() throws Exception { + doSearching("\"John Kennedy\""); + doStandardHighlights(); + //Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 2); + } + + public void testGetBestFragmentsSpan() throws Exception { + SpanQuery[] clauses = { + new SpanTermQuery(new Term("contents", "john")), + new SpanTermQuery(new Term("contents", "kennedy")), + }; + + SpanNearQuery snq = new SpanNearQuery(clauses, 1, true); + doSearching(snq); + doStandardHighlights(); + //Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 2); + } + + public void testOffByOne() throws IOException { + String text = "help me [54-65]"; + String field = "data"; + TermQuery query = new TermQuery(new Term(field, "help")); + TokenStream tokenStream = new StandardAnalyzer().tokenStream(field, + new StringReader(text)); + Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), + new SpanScorer(query, field, tokenStream)); + hg.setTextFragmenter(new NullFragmenter()); + + String match = null; + match = hg.getBestFragment(new StandardAnalyzer(), field, text); + assertEquals("help me [54-65]", match); + } + + public void testGetBestFragmentsFilteredQuery() throws Exception { + RangeFilter rf = new RangeFilter("contents", "john", "john", 
true, true); + SpanQuery[] clauses = { + new SpanTermQuery(new Term("contents", "john")), + new SpanTermQuery(new Term("contents", "kennedy")), + }; + SpanNearQuery snq = new SpanNearQuery(clauses, 1, true); + FilteredQuery fq = new FilteredQuery(snq, rf); + + doSearching(fq); + doStandardHighlights(); + //Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 2); + } + + public void testGetBestFragmentsFilteredPhraseQuery() + throws Exception { + RangeFilter rf = new RangeFilter("contents", "john", "john", true, true); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term("contents", "john")); + pq.add(new Term("contents", "kennedy")); + + FilteredQuery fq = new FilteredQuery(pq, rf); + + doSearching(fq); + doStandardHighlights(); + //Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 2); + } + + public void testGetBestFragmentsMultiTerm() throws Exception { + doSearching("John Kenn*"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 5); + } + + public void testGetBestFragmentsWithOr() throws Exception { + doSearching("JFK OR Kennedy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 5); + } + + public void testGetBestSingleFragment() throws Exception { + doSearching("Kennedy"); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.rewind(); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + + String result = 
highlighter.getBestFragment(tokenStream, text); + System.out.println("\t" + result); + } + + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 4); + + numHighlights = 0; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.rewind(); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + + highlighter.getBestFragment(analyzer, FIELD_NAME, text); + } + + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 4); + + numHighlights = 0; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.rewind(); + highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10); + } + + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 4); + } + + public void testGetBestSingleFragmentWithWeights() + throws Exception { + WeightedSpanTerm[] wTerms = new WeightedSpanTerm[2]; + wTerms[0] = new WeightedSpanTerm(10f, "hello"); + + List lowPos = new ArrayList(); + List highPos = new ArrayList(); + lowPos.add(new Integer(0)); + highPos.add(new Integer(0)); + wTerms[0].setLowPos(lowPos); + wTerms[0].setHighPos(highPos); + wTerms[1] = new WeightedSpanTerm(1f, "kennedy"); + lowPos = new ArrayList(); + highPos = new ArrayList(); + lowPos.add(new Integer(14)); + highPos.add(new Integer(14)); + wTerms[1].setLowPos(lowPos); + wTerms[1].setHighPos(highPos); + + TokenStream tokenStream = 
analyzer.tokenStream(FIELD_NAME, + new StringReader(texts[0])); + Highlighter highlighter = new Highlighter(new SpanScorer(wTerms)); + + highlighter.setTextFragmenter(new SimpleFragmenter(2)); + + String result = highlighter.getBestFragment(tokenStream, texts[0]).trim(); + assertTrue("Failed to find best section using weighted terms. Found: [" + + result + "]", "Hello".equals(result)); + + //readjust weights + wTerms[1].setWeight(50f); + tokenStream = analyzer.tokenStream(FIELD_NAME, + new StringReader(texts[0])); + highlighter = new Highlighter(new SpanScorer(wTerms)); + highlighter.setTextFragmenter(new SimpleFragmenter(2)); + + result = highlighter.getBestFragment(tokenStream, texts[0]).trim(); + assertTrue("Failed to find best section using weighted terms. Found: " + + result, "kennedy".equals(result)); + } + + public void testGetBestSingleFragmentWithWeights2() + throws Exception { + doSearching("refers kennedy"); + + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(texts[1]))); + Highlighter highlighter = new Highlighter(new SpanScorer(query, + FIELD_NAME, tokenStream, reader)); + tokenStream.rewind(); + highlighter.setTextFragmenter(new SimpleFragmenter(2)); + + String result = highlighter.getBestFragment(tokenStream, texts[1]).trim(); + assertTrue("Failed to find best section using weighted terms. 
Found: [" + + result + "]", "refers".equals(result)); + } + + // tests a "complex" analyzer that produces multiple + // overlapping tokens + public void testOverlapAnalyzer() throws Exception { + HashMap synonyms = new HashMap(); + synonyms.put("football", "soccer,footie"); + + Analyzer analyzer = new SynonymAnalyzer(synonyms); + String srchkey = "football"; + + String s = "football-soccer in the euro 2004 footie competition"; + QueryParser parser = new QueryParser("bookid", analyzer); + Query query = parser.parse(srchkey); + + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + null, new StringReader(s))); + + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, "bookid", tokenStream)); + tokenStream.rewind(); + + // Get 3 best fragments and seperate with a "..." + String result = highlighter.getBestFragments(tokenStream, s, 3, "..."); + String expectedResult = "football-soccer in the euro 2004 footie competition"; + assertEquals(expectedResult, result); + } + + public void testGetSimpleHighlight() throws Exception { + doSearching("Kennedy"); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.rewind(); + + String result = highlighter.getBestFragment(tokenStream, text); + System.out.println("\t" + result); + } + + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 4); + } + + public void testGetTextFragments() throws Exception { + doSearching("Kennedy"); + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + + SpanScorer spanScorer = new 
SpanScorer(query, FIELD_NAME, tokenStream); + Highlighter highlighter = new Highlighter(this, + spanScorer); + highlighter.setTextFragmenter(new SimpleFragmenter(20)); + + tokenStream.rewind(); + + String[] stringResults = highlighter.getBestFragments(tokenStream, + text, 10); + tokenStream.rewind(); + spanScorer.reset(); + //tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + TextFragment[] fragmentResults = highlighter.getBestTextFragments(tokenStream, + text, true, 10); + + assertTrue("Failed to find correct number of text Fragments: " + + fragmentResults.length + " vs " + stringResults.length, + fragmentResults.length == stringResults.length); + + for (int j = 0; j < stringResults.length; j++) { + System.out.println(fragmentResults[j]); + assertTrue("Failed to find same text Fragments: " + + fragmentResults[j] + " found", + fragmentResults[j].toString().equals(stringResults[j])); + } + } + } + + public void testGetFragmentsSimpleSpanFragmenter() throws Exception { + doSearching(new SpanNearQuery( + new SpanQuery[] { + new SpanTermQuery(new Term(FIELD_NAME, "shot")), + new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) + }, 3, false)); + + for (int i = 0; i < hits.length(); i++) { + System.out.println("Hit:" + i); + String text = hits.doc(i).get(FIELD_NAME); + System.out.println("text:" + text); + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + + SpanScorer spanScorer = new SpanScorer(query, FIELD_NAME, tokenStream); + Highlighter highlighter = new Highlighter(this, + spanScorer); + highlighter.setTextFragmenter(new SimpleSpanFragmenter(spanScorer, 1)); + + tokenStream.rewind(); + + String[] stringResults = highlighter.getBestFragments(tokenStream, + text, 10); + tokenStream.rewind(); + spanScorer.reset(); + //tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + TextFragment[] fragmentResults = highlighter.getBestTextFragments(tokenStream, + text, true, 
10); + + assertTrue("Failed to find correct number of text Fragments: " + + fragmentResults.length + " vs " + stringResults.length, + fragmentResults.length == stringResults.length); + + + for (int j = 0; j < stringResults.length; j++) { + System.out.println(fragmentResults[j]); + assertTrue("Failed to find same text Fragments: " + + fragmentResults[j] + " found", + fragmentResults[j].toString().equals(stringResults[j])); + } + } + + } + + public void testMaxSizeHighlight() throws Exception { + doSearching("meat"); + + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(texts[0]))); + + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setMaxDocBytesToAnalyze(30); + tokenStream.rewind(); + highlighter.getBestFragment(tokenStream, texts[0]); + assertTrue("Setting MaxDocBytesToAnalyze should have prevented " + + "us from finding matches for this record: " + numHighlights + + " found", numHighlights == 0); + } + + public void testMaxSizeHighlightTruncates() throws IOException { + String goodWord = "goodtoken"; + String[] stopWords = { "stoppedtoken" }; + + TermQuery query = new TermQuery(new Term("data", goodWord)); + SimpleHTMLFormatter fm = new SimpleHTMLFormatter(); + + String field = "data"; + + String match = null; + StringBuffer sb = new StringBuffer(); + sb.append(goodWord); + + for (int i = 0; i < 10000; i++) { + sb.append(" "); + sb.append(stopWords[0]); + } + + TokenStream tokenStream = new StandardAnalyzer(stopWords).tokenStream(field, + new StringReader(sb.toString())); + + SpanScorer spanScorer = new SpanScorer(query, field, tokenStream); + Highlighter hg = new Highlighter(this, + spanScorer); + hg.setTextFragmenter(new NullFragmenter()); + + hg.setMaxDocBytesToAnalyze(100); + match = hg.getBestFragment(new StandardAnalyzer(stopWords), field, + sb.toString()); + assertTrue("Matched text should be no more than 100 chars in length ", + 
match.length() < hg.getMaxDocBytesToAnalyze()); + + spanScorer.reset(); + //add another tokenized word to the overrall length - but set way beyond + //the length of text under consideration (after a large slug of stop words + whitespace) + sb.append(" "); + sb.append(goodWord); + match = hg.getBestFragment(new StandardAnalyzer(stopWords), "data", + sb.toString()); + assertTrue("Matched text should be no more than 100 chars in length ", + match.length() < hg.getMaxDocBytesToAnalyze()); + } + + public void testUnRewrittenQuery() throws IOException, ParseException { + //test to show how rewritten query can still be used + searcher = new IndexSearcher(ramDir); + + Analyzer analyzer = new StandardAnalyzer(); + + QueryParser parser = new QueryParser(FIELD_NAME, analyzer); + Query query = parser.parse("JF? or Kenned*"); + System.out.println("Searching with primitive query"); + + //forget to set this and... + //query=query.rewrite(reader); + Hits hits = searcher.search(query); + + //create an instance of the highlighter with the tags used to surround highlighted text + // QueryHighlightExtractor highlighter = new QueryHighlightExtractor(this, query, new StandardAnalyzer()); + int maxNumFragmentsRequired = 3; + + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + tokenStream.rewind(); + + String highlightedText = highlighter.getBestFragments(tokenStream, + text, maxNumFragmentsRequired, "..."); + System.out.println(highlightedText); + } + + //We expect to have zero highlights if the query is multi-terms and is not rewritten! 
+ assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 0); + } + + public void testNoFragments() throws Exception { + doSearching("AnInvalidQueryWhichShouldYieldNoResults"); + + for (int i = 0; i < texts.length; i++) { + String text = texts[i]; + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.rewind(); + + String result = highlighter.getBestFragment(tokenStream, text); + assertNull("The highlight result should be null for text with no query terms", + result); + } + } + + public void testMultiSearcher() throws Exception { + //setup index 1 + RAMDirectory ramDir1 = new RAMDirectory(); + IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), + true); + Document d = new Document(); + Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES, + Field.Index.TOKENIZED); + d.add(f); + writer1.addDocument(d); + writer1.optimize(); + writer1.close(); + + IndexReader reader1 = IndexReader.open(ramDir1); + + //setup index 2 + RAMDirectory ramDir2 = new RAMDirectory(); + IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), + true); + d = new Document(); + f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES, + Field.Index.TOKENIZED); + d.add(f); + writer2.addDocument(d); + writer2.optimize(); + writer2.close(); + + IndexReader reader2 = IndexReader.open(ramDir2); + + IndexSearcher[] searchers = new IndexSearcher[2]; + searchers[0] = new IndexSearcher(ramDir1); + searchers[1] = new IndexSearcher(ramDir2); + + MultiSearcher multiSearcher = new MultiSearcher(searchers); + QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer()); + query = parser.parse("multi*"); + System.out.println("Searching for: " + query.toString(FIELD_NAME)); + //at this point the multisearcher calls combine(query[]) + hits = 
multiSearcher.search(query); + + //query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer()); + Query[] expandedQueries = new Query[2]; + expandedQueries[0] = query.rewrite(reader1); + expandedQueries[1] = query.rewrite(reader2); + query = query.combine(expandedQueries); + + //create an instance of the highlighter with the tags used to surround highlighted text + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.rewind(); + + String highlightedText = highlighter.getBestFragment(tokenStream, + text); + System.out.println(highlightedText); + } + + assertTrue("Failed to find correct number of highlights " + + numHighlights + " found", numHighlights == 2); + } + + public void testFieldSpecificHighlighting() + throws IOException, ParseException { + String docMainText = "fred is one of the people"; + QueryParser parser = new QueryParser(FIELD_NAME, analyzer); + Query query = parser.parse("fred category:people"); + + //highlighting respects fieldnames used in query + TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, + new StringReader(docMainText)); + SpanScorer fieldSpecificScorer = new SpanScorer(query, FIELD_NAME, + tokenStream); + Highlighter fieldSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(), + fieldSpecificScorer); + fieldSpecificHighlighter.setTextFragmenter(new NullFragmenter()); + + String result = fieldSpecificHighlighter.getBestFragment(analyzer, + "contents", docMainText); + assertEquals("Should match", result, "fred is one of the people"); + + //highlighting does not respect fieldnames used in query + tokenStream = analyzer.tokenStream(FIELD_NAME, + new StringReader(docMainText)); + + SpanScorer fieldInSpecificScorer = new SpanScorer(query, "contents", + 
tokenStream); + Highlighter fieldInSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(), + fieldInSpecificScorer); + fieldInSpecificHighlighter.setTextFragmenter(new NullFragmenter()); + result = fieldInSpecificHighlighter.getBestFragment(analyzer, + FIELD_NAME, docMainText); + assertEquals("Should match", result, + "fred is one of the people"); + + reader.close(); + } + + protected TokenStream getTS2() { + //String s = "Hi-Speed10 foo"; + return new TokenStream() { + Iterator iter; + List lst; + + { + lst = new ArrayList(); + + Token t; + t = new Token("hi", 0, 2); + lst.add(t); + t = new Token("hispeed", 0, 8); + lst.add(t); + t = new Token("speed", 3, 8); + t.setPositionIncrement(0); + lst.add(t); + t = new Token("10", 8, 10); + lst.add(t); + t = new Token("foo", 11, 14); + lst.add(t); + iter = lst.iterator(); + } + + public Token next() throws IOException { + return iter.hasNext() ? (Token) iter.next() : null; + } + }; + } + + // same token-stream as above, but the bigger token comes first this time + protected TokenStream getTS2a() { + //String s = "Hi-Speed10 foo"; + return new TokenStream() { + Iterator iter; + List lst; + + { + lst = new ArrayList(); + + Token t; + t = new Token("hispeed", 0, 8); + lst.add(t); + t = new Token("hi", 0, 2); + t.setPositionIncrement(0); + lst.add(t); + t = new Token("speed", 3, 8); + lst.add(t); + t = new Token("10", 8, 10); + lst.add(t); + t = new Token("foo", 11, 14); + lst.add(t); + iter = lst.iterator(); + } + + public Token next() throws IOException { + return iter.hasNext() ? 
(Token) iter.next() : null; + } + }; + } + + public void testOverlapAnalyzer2() throws Exception { + String s = "Hi-Speed10 foo"; + + Query query; + Highlighter highlighter; + String result; + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("foo"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", getTS2())); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("10"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", getTS2())); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", getTS2())); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("speed"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", getTS2())); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse( + "hispeed"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", getTS2())); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse( + "hi speed"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", getTS2())); + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + /////////////////// same tests, just put the bigger overlapping token first + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("foo"); + highlighter = new Highlighter(this, + new 
SpanScorer(query, "text", getTS2a())); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("10"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", getTS2a())); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", getTS2a())); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse("speed"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", getTS2a())); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse( + "hispeed"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", getTS2a())); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + + query = new QueryParser("text", new WhitespaceAnalyzer()).parse( + "hi speed"); + highlighter = new Highlighter(this, + new SpanScorer(query, "text", getTS2a())); + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); + assertEquals("Hi-Speed10 foo", result); + } + + /* + + public void testBigramAnalyzer() throws IOException, ParseException + { + //test to ensure analyzers with none-consecutive start/end offsets + //dont double-highlight text + //setup index 1 + RAMDirectory ramDir = new RAMDirectory(); + Analyzer bigramAnalyzer=new CJKAnalyzer(); + IndexWriter writer = new IndexWriter(ramDir,bigramAnalyzer , true); + Document d = new Document(); + Field f = new Field(FIELD_NAME, "java abc def", true, true, true); + d.add(f); + writer.addDocument(d); + 
writer.close(); + IndexReader reader = IndexReader.open(ramDir); + + IndexSearcher searcher=new IndexSearcher(reader); + query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer); + System.out.println("Searching for: " + query.toString(FIELD_NAME)); + hits = searcher.search(query); + + Highlighter highlighter = + new Highlighter(this,new QueryFragmentScorer(query)); + + for (int i = 0; i < hits.length(); i++) + { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text)); + String highlightedText = highlighter.getBestFragment(tokenStream,text); + System.out.println(highlightedText); + } + + } + */ + public void doSearching(String queryString) throws Exception { + QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer()); + query = parser.parse(queryString); + doSearching(query); + } + + public void doSearching(Query unReWrittenQuery) throws Exception { + searcher = new IndexSearcher(ramDir); + //for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query! 
+ query = unReWrittenQuery.rewrite(reader); + System.out.println("Searching for: " + query.toString(FIELD_NAME)); + hits = searcher.search(query); + } + + void doStandardHighlights() throws Exception { + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + int maxNumFragmentsRequired = 2; + String fragmentSeparator = "..."; + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.rewind(); + highlighter.setTextFragmenter(new SimpleFragmenter(20)); + + String result = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, fragmentSeparator); + System.out.println("\t" + result); + } + } + + void doStandardSpanHighlights() throws Exception { + for (int i = 0; i < hits.length(); i++) { + String text = hits.doc(i).get(FIELD_NAME); + int maxNumFragmentsRequired = 2; + String fragmentSeparator = "..."; + CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream( + FIELD_NAME, new StringReader(text))); + Highlighter highlighter = new Highlighter(this, + new SpanScorer(query, FIELD_NAME, tokenStream)); + tokenStream.rewind(); + highlighter.setTextFragmenter(new SimpleFragmenter(20)); + + String result = highlighter.getBestFragments(tokenStream, text, + maxNumFragmentsRequired, fragmentSeparator); + System.out.println("\t" + result); + } + } + + private void addDoc(IndexWriter writer, String text) + throws IOException { + Document d = new Document(); + Field f = new Field(FIELD_NAME, text, Field.Store.YES, + Field.Index.TOKENIZED); + d.add(f); + writer.addDocument(d); + } + + public String highlightTerm(String originalText, TokenGroup group) { + if (group.getTotalScore() <= 0) { + return originalText; + } + + numHighlights++; //update stats used in assertions + + return "" + originalText + ""; + } + + // 
=================================================================== + // ========== BEGIN TEST SUPPORTING CLASSES + // ========== THESE LOOK LIKE, WITH SOME MORE EFFORT THESE COULD BE + // ========== MADE MORE GENERALLY USEFUL. + // TODO - make synonyms all interchangeable with each other and produce + // a version that does hyponyms - the "is a specialised type of ...." + // so that car = audi, bmw and volkswagen but bmw != audi so different + // behaviour to synonyms + // =================================================================== + class SynonymAnalyzer extends Analyzer { + private Map synonyms; + + public SynonymAnalyzer(Map synonyms) { + this.synonyms = synonyms; + } + + /* (non-Javadoc) + * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader) + */ + public TokenStream tokenStream(String arg0, Reader arg1) { + return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms); + } + } + + /** + * Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer) + * @author MAHarwood + */ + class SynonymTokenizer extends TokenStream { + private TokenStream realStream; + private Token currentRealToken = null; + private Map synonyms; + StringTokenizer st = null; + + public SynonymTokenizer(TokenStream realStream, Map synonyms) { + this.realStream = realStream; + this.synonyms = synonyms; + } + + public Token next() throws IOException { + if (currentRealToken == null) { + Token nextRealToken = realStream.next(); + + if (nextRealToken == null) { + return null; + } + + String expansions = (String) synonyms.get(nextRealToken.termText()); + + if (expansions == null) { + return nextRealToken; + } + + st = new StringTokenizer(expansions, ","); + + if (st.hasMoreTokens()) { + currentRealToken = nextRealToken; + } + + return currentRealToken; + } else { + String nextExpandedValue = st.nextToken(); + Token expandedToken = new Token(nextExpandedValue, + currentRealToken.startOffset(), + 
currentRealToken.endOffset()); + expandedToken.setPositionIncrement(0); + + if (!st.hasMoreTokens()) { + currentRealToken = null; + st = null; + } + + return expandedToken; + } + } + } +}