+/**
+ * Caching TokenStream.
+ * Records the underlying TokenStream as it is
+ * read and allows a rewind() or retrieval of the
+ * Tokens as a List.
+ *
+ * @author Mark Miller
+ */
+public class CachedTokenStream extends TokenStream {
+ private List tokenList = new ArrayList(100);
+ private TokenStream tokenStream;
+ private boolean canRewind;
+ private Iterator it;
+
+ /**
+ *
+ * @param tokenStream underlying TokenStream
+ */
+ public CachedTokenStream(TokenStream tokenStream) {
+ this.tokenStream = tokenStream;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.analysis.TokenStream#close()
+ */
+ public void close() throws IOException {
+ if (this.tokenStream != null) {
+ this.tokenStream.close();
+ }
+ }
+
+ /**
+ * @return the recorded Tokens as a List
+ */
+ public List getTokenList() {
+ if (!canRewind) {
+ throw new IllegalStateException(
+ "You must read the whole stream before retrieving the internal tokenList");
+ }
+
+ return tokenList;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.analysis.TokenStream#next()
+ */
+ public Token next() throws IOException {
+ Token token = null;
+
+ if (it == null) {
+ token = tokenStream.next();
+
+ if (token != null) {
+ tokenList.add(token);
+ }
+ } else {
+ if (it.hasNext()) {
+ token = (Token) it.next();
+ }
+ }
+
+ if (token == null) {
+ this.tokenStream.close();
+ canRewind = true;
+ }
+
+ return token;
+ }
+
+ /**
+ * Resets the stream to replay the recorded Tokens. Only valid
+ * once the underlying TokenStream has been fully read.
+ */
+ public void rewind() {
+ if (!canRewind) {
+ throw new IllegalStateException(
+ "You must read the whole stream before rewinding");
+ }
+
+ it = tokenList.iterator();
+ }
+}
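[Reviewer note] A minimal usage sketch of the class above -- names like 'analyzer' and 'text' are assumptions for illustration, not part of the patch. Per the canRewind checks, the stream must be read to exhaustion once before rewind() or getTokenList() become legal:

    // first pass: each Token returned by next() is recorded internally
    CachedTokenStream cached = new CachedTokenStream(
        analyzer.tokenStream("contents", new StringReader(text)));
    for (Token t = cached.next(); t != null; t = cached.next()) {
        // consume the stream (e.g. hand it to MemoryIndex.addField)
    }
    cached.rewind();                      // legal only after next() has returned null
    Token replayed = cached.next();       // second pass replays the cached Tokens
    List tokens = cached.getTokenList();  // or retrieve the whole List directly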
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 510468)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy)
@@ -222,11 +222,13 @@
int lastEndOffset = 0;
textFragmenter.start(text);
+ int position = -1;
TokenGroup tokenGroup=new TokenGroup();
token = tokenStream.next();
 while ((token!= null)&&(token.startOffset()< maxDocBytesToAnalyze))
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/QuerySpansExtractor.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/QuerySpansExtractor.java	(revision 0)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/QuerySpansExtractor.java	(revision 0)
+ /**
+ * Creates WeightedSpanTerms using the terms from
+ * query and the position information from spans.
+ *
+ * @param query to extract terms from
+ * @param fieldName restricts terms used to this field
+ * @param terms map to place created WeightedSpanTerms in
+ * @param spans Spans for query
+ * @return true if spans were found for query; false otherwise
+ * @throws IOException
+ */
+ private boolean addSpans(Query query, String fieldName, Map terms,
+ Spans spans) throws IOException {
+ List lowPositions = new ArrayList();
+ List highPositions = new ArrayList();
+
+ // collect span positions
+ while (spans.next()) {
+ lowPositions.add(new Integer(spans.start()));
+ highPositions.add(new Integer(spans.end() - 1));
+ }
+
+ if (lowPositions.isEmpty()) {
+ // no spans found
+ return false;
+ }
+
+ HashSet nonWeightedTerms = new HashSet();
+ query.extractTerms(nonWeightedTerms);
+
+ for (Iterator iter = nonWeightedTerms.iterator(); iter.hasNext();) {
+ Term queryTerm = (Term) iter.next();
+
+ if ((fieldName == null) || queryTerm.field().equals(fieldName)) {
+ WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.getBoost(),
+ queryTerm.text());
+ weightedSpanTerm.setHighPos(highPositions);
+ weightedSpanTerm.setLowPos(lowPositions);
+ terms.put(queryTerm.text(), weightedSpanTerm);
+ }
+ }
+
+ return true;
+ }
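[Reviewer note] A worked example of the position bookkeeping above, for illustration only; Spans.end() is exclusive, hence the end - 1:

    // Suppose a SpanNearQuery matches the tokens at positions 3 and 4.
    // Spans reports start() == 3 and end() == 5, so addSpans records:
    //   lowPositions  -> [3]
    //   highPositions -> [4]
    // WeightedSpanTerm.checkPosition(4) later walks both lists in lockstep
    // and accepts position 4 because 3 <= 4 && 4 <= 4.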
+
+ /**
+ * Creates WeightedSpanTerms using Spans retrieved from reader.
+ * Retrieve the WeightedSpanTerms with getTerms().
+ *
+ * @param query
+ * @param fieldName
+ * @param reader
+ * @throws IOException
+ */
+ public void getWeightedSpanTerms(Query query, String fieldName,
+ IndexReader reader) throws IOException {
+ getSpans(query, fieldName, reader, terms);
+ }
+
+ /**
+ * Creates WeightedSpanTerms using Spans retrieved from reader, and
+ * weights each term by its inverse document frequency in sourceIndexReader.
+ *
+ * @param query
+ * @param fieldName
+ * @param reader
+ * @param sourceIndexReader
+ * @throws IOException
+ */
+ public void getWeightedSpanTermsWithScores(Query query, String fieldName,
+ IndexReader reader, IndexReader sourceIndexReader)
+ throws IOException {
+ getSpans(query, fieldName, reader, terms);
+
+ int totalNumDocs = sourceIndexReader.numDocs();
+ Set weightedTerms = terms.keySet();
+ Iterator it = weightedTerms.iterator();
+
+ while (it.hasNext()) {
+ WeightedSpanTerm weightedSpanTerm = (WeightedSpanTerm) terms.get(it.next());
+ int docFreq = sourceIndexReader.docFreq(new Term(fieldName,
+ weightedSpanTerm.term));
+
+ // IDF algorithm taken from DefaultSimilarity class
+ float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq +
+ 1)) + 1.0);
+ weightedSpanTerm.weight *= idf;
+ }
+ }
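[Reviewer note] A worked instance of the IDF factor above (numbers invented for illustration):

    // totalNumDocs = 5, docFreq = 2:
    //   idf = log(5 / 3) + 1 ~= 1.51
    // so a rarer term multiplies its span weight by a larger factor.
    float idf = (float) (Math.log(5.0 / 3.0) + 1.0); // ~= 1.51f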
+
+ /**
+ * @param query
+ * @param fieldName
+ * @param reader
+ * @param terms
+ * @return true if spans were extracted for query; false otherwise
+ * @throws IOException
+ */
+ private boolean getSpans(Query query, String fieldName, IndexReader reader,
+ Map terms) throws IOException {
+ if (query instanceof BooleanQuery) {
+ return getSpansFromBooleanQuery((BooleanQuery) query, fieldName,
+ reader, terms);
+ } else if (query instanceof PhraseQuery) {
+ return getSpansFromPhraseQuery((PhraseQuery) query, fieldName,
+ reader, terms);
+ } else if (query instanceof TermQuery) {
+ return getSpansFromTermQuery((TermQuery) query, fieldName, reader,
+ terms);
+ } else if (query instanceof SpanQuery) {
+ return getSpansFromSpanQuery((SpanQuery) query, fieldName, reader,
+ terms);
+ } else if (query instanceof FilteredQuery) {
+ return getTermsFromFilteredQuery((FilteredQuery) query, fieldName,
+ reader, terms);
+ } else {
+ // throw new UnsupportedOperationException("Query type is
+ // unsupported:" + query.getClass().getName());
+ return true;
+ }
+ }
+
+ /**
+ * @param query
+ * @param fieldName
+ * @param reader
+ * @param terms
+ * @return false if a prohibited clause matched or a required clause found no spans; true otherwise
+ * @throws IOException
+ */
+ private boolean getSpansFromBooleanQuery(BooleanQuery query,
+ String fieldName, IndexReader reader, Map terms)
+ throws IOException {
+ BooleanClause[] queryClauses = query.getClauses();
+ int i;
+
+ Map possibleTerms = new HashMap();
+
+ for (i = 0; i < queryClauses.length; i++) {
+ if (queryClauses[i].isProhibited()) {
+ Map tempTerms = new HashMap();
+
+ if (getSpans(queryClauses[i].getQuery(), fieldName, reader,
+ tempTerms)) {
+ return false;
+ }
+ } else if (queryClauses[i].isRequired()) {
+ if (!getSpans(queryClauses[i].getQuery(), fieldName, reader,
+ possibleTerms)) {
+ return false;
+ }
+ } else {
+ getSpans(queryClauses[i].getQuery(), fieldName, reader,
+ possibleTerms);
+ }
+ }
+
+ terms.putAll(possibleTerms);
+
+ return true;
+ }
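[Reviewer note] A hedged illustration of the clause handling above (query shape assumed): because the MemoryIndex holds exactly the one document being highlighted, a prohibited sub-query that produces spans means that document is not a real hit, so extraction is abandoned.

    // Hypothetical query: +kennedy -shot
    BooleanQuery bq = new BooleanQuery();
    bq.add(new TermQuery(new Term("contents", "kennedy")), BooleanClause.Occur.MUST);
    bq.add(new TermQuery(new Term("contents", "shot")), BooleanClause.Occur.MUST_NOT);
    // getSpansFromBooleanQuery gathers spans for "kennedy" into possibleTerms,
    // but returns false -- leaving terms out of the map -- if "shot" also matches.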
+
+ /**
+ * @param query
+ * @param fieldName
+ * @param reader
+ * @param terms
+ * @return true if spans were found for the phrase; false otherwise
+ * @throws IOException
+ */
+ private boolean getSpansFromPhraseQuery(PhraseQuery query,
+ String fieldName, IndexReader reader, Map terms)
+ throws IOException {
+ Term[] queryTerms = query.getTerms();
+ int i;
+ SpanQuery[] clauses = new SpanQuery[queryTerms.length];
+
+ for (i = 0; i < queryTerms.length; i++) {
+ clauses[i] = new SpanTermQuery(queryTerms[i]);
+ }
+
+ SpanNearQuery sp = new SpanNearQuery(clauses, query.getSlop(), false);
+
+ return addSpans(query, fieldName, terms, sp.getSpans(reader));
+ }
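[Reviewer note] Concretely, the conversion above matches a phrase positionally via an equivalent span query ("contents" is just an example field name):

    // "john kennedy"~1 becomes, for span extraction:
    SpanQuery[] clauses = {
        new SpanTermQuery(new Term("contents", "john")),
        new SpanTermQuery(new Term("contents", "kennedy"))
    };
    SpanNearQuery equivalent = new SpanNearQuery(clauses, 1, false); // slop 1, unordered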
+
+ /**
+ * @param query
+ * @param fieldName
+ * @param reader
+ * @param terms
+ * @return true if spans were found for query; false otherwise
+ * @throws IOException
+ */
+ private boolean getSpansFromSpanQuery(SpanQuery query, String fieldName,
+ IndexReader reader, Map terms) throws IOException {
+ return addSpans(query, fieldName, terms, query.getSpans(reader));
+ }
+
+ /**
+ * @param query
+ * @param fieldName
+ * @param reader
+ * @param terms
+ * @return true if spans were found for the term; false otherwise
+ * @throws IOException
+ */
+ private boolean getSpansFromTermQuery(TermQuery query, String fieldName,
+ IndexReader reader, Map terms) throws IOException {
+ Term term = query.getTerm();
+
+ SpanTermQuery stq = new SpanTermQuery(term);
+
+ Spans spans = stq.getSpans(reader);
+
+ return addSpans(query, fieldName, terms, spans);
+ }
+
+ /**
+ * @return the Map of terms to WeightedSpanTerms
+ */
+ public Map getTerms() {
+ return terms;
+ }
+
+ /**
+ * @param query
+ * @param fieldName
+ * @param reader
+ * @param terms
+ * @return true if spans were extracted from the wrapped query; false otherwise
+ * @throws IOException
+ */
+ private boolean getTermsFromFilteredQuery(FilteredQuery query,
+ String fieldName, IndexReader reader, Map terms)
+ throws IOException {
+ return getSpans(query.getQuery(), fieldName, reader, terms);
+ }
+}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (revision 510468)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (working copy)
@@ -36,7 +36,7 @@
* @return a score which is passed to the Highlighter class to influence the mark-up of the text
* (this return value is NOT used to score the fragment)
*/
- public float getTokenScore(Token token);
+ public float getTokenScore(Token token, int position);
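[Reviewer note] A sketch of how a caller presumably supplies the new position argument, extrapolated from the "int position = -1;" line in the Highlighter hunk above (the loop body is an assumption, not quoted from the patch):

    int position = -1;
    for (Token token = tokenStream.next(); token != null; token = tokenStream.next()) {
        position += token.getPositionIncrement(); // stacked tokens share a position
        float score = scorer.getTokenScore(token, position);
        // ... mark up the token if score > 0 ...
    }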
/**
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html (revision 0)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/spanscorer.html (revision 0)
@@ -0,0 +1,54 @@
+<html>
+<body>
+<p>
+The spanscorer package contains classes to provide the Highlighter with the ability
+to highlight the Tokens that contributed to a search 'hit'.
+The SpanScorer class is the central component and it will attempt to score Terms
+based on whether they actually participated in scoring the Query.
+</p>
+<p>
+The implementation is very similar to QueryScorer in that WeightedSpanTerms are extracted
+from the given Query and then placed in a Map. During Token scoring, Terms found in
+the Map return a score equal to their weight. The added wrinkle is that when terms are
+extracted, the sub Queries that make up the Query are converted to SpanQuerys and
+SpanQuery.getSpans() is applied to a MemoryIndex containing the TokenStream of the text to
+be highlighted. The start and end positions of the matching Spans are recorded with the
+respective WeightedSpanTerms and these positions are then used to filter possible Token
+matches during scoring. This method of 'real' hit highlighting may not be 100% perfect, but
+the results are very accurate and very likely acceptable.
+</p>
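[Reviewer note] The flow described above, condensed into a sketch; this mirrors SpanScorer.init() and QuerySpansExtractor.addSpans() from this patch, with variable names assumed:

    MemoryIndex indexer = new MemoryIndex();
    indexer.addField(FIELD_NAME, cachedTokenStream);   // index the text to be highlighted
    IndexReader memReader = indexer.createSearcher().getIndexReader();

    Spans spans = spanQuery.getSpans(memReader);       // sub-query converted to a SpanQuery
    while (spans.next()) {
        // record spans.start() .. spans.end() - 1 on the matching WeightedSpanTerm;
        // getTokenScore() later rejects tokens whose position lies outside these ranges
    }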
+
+<pre>
+ IndexSearcher searcher = new IndexSearcher(ramDir);
+ Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
+ query = query.rewrite(reader); //required to expand search terms
+ Hits hits = searcher.search(query);
+
+ for (int i = 0; i < hits.length(); i++)
+ {
+ String text = hits.doc(i).get(FIELD_NAME);
+ CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+ FIELD_NAME, new StringReader(text)));
+ Highlighter highlighter = new Highlighter(this,
+ new SpanScorer(query, FIELD_NAME, tokenStream));
+ tokenStream.rewind();
+
+ // Get 3 best fragments and separate with a "..."
+ String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
+ System.out.println(result);
+ }
+</pre>
+
+
+<p>
+The SpanScorer class has a constructor which can use an IndexReader to derive the IDF (inverse document frequency)
+for each term in order to influence the score. This is useful for helping to extract the most significant sections
+of a document and in supplying scores used by the GradientFormatter to color significant words more strongly.
+The SpanScorer.getMaxTermWeight method is useful when passed to the GradientFormatter constructor to define the top score
+which is associated with the top color.
+</p>
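[Reviewer note] A hedged sketch of the IDF-weighted SpanScorer + GradientFormatter combination described above (colors are placeholders; getMaxTermWeight is the accessor defined in SpanScorer.java below):

    SpanScorer scorer = new SpanScorer(query, FIELD_NAME, tokenStream, reader);
    // top of the color scale = the highest IDF-adjusted term weight
    GradientFormatter formatter = new GradientFormatter(scorer.getMaxTermWeight(),
            null, null, "#FFFFFF", "#000064");
    Highlighter highlighter = new Highlighter(formatter, scorer);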
+
+</body>
+</html>
\ No newline at end of file
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java	(revision 0)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java	(revision 0)
@@ -0,0 +1,158 @@
+package org.apache.lucene.search.highlight;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.memory.MemoryIndex;
+import org.apache.lucene.search.Query;
+
+import java.io.IOException;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+
+/**
+ * @author Mark Miller
+ *
+ */
+public class SpanScorer implements Scorer {
+  private float totalScore;
+  private Set foundTerms;
+  private IndexReader memReader;
+  private Map fieldWeightedSpanTerms;
+  private float maxTermWeight;
+
+  /**
+   * @param query
+   * @param field
+   * @param tokenStream
+   * @throws IOException
+   */
+  public SpanScorer(Query query, String field, TokenStream tokenStream)
+    throws IOException {
+    init(query, field, tokenStream, null);
+  }
+
+  /**
+   * @param query
+   * @param field
+   * @param tokenStream
+   * @param reader
+   * @throws IOException
+   */
+  public SpanScorer(Query query, String field, TokenStream tokenStream,
+    IndexReader reader) throws IOException {
+    init(query, field, tokenStream, reader);
+  }
+
+  /**
+   * @param weightedTerms
+   */
+  public SpanScorer(WeightedSpanTerm[] weightedTerms) {
+    this.fieldWeightedSpanTerms = new HashMap(weightedTerms.length);
+
+    for (int i = 0; i < weightedTerms.length; i++) {
+      WeightedSpanTerm existingTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get(weightedTerms[i].term);
+
+      if ((existingTerm == null) ||
+          (existingTerm.weight < weightedTerms[i].weight)) {
+        // if a term is defined more than once, always use the highest
+        // scoring weight
+        fieldWeightedSpanTerms.put(weightedTerms[i].term,
+          weightedTerms[i]);
+        maxTermWeight = Math.max(maxTermWeight,
+          weightedTerms[i].getWeight());
+      }
+    }
+  }
+
+  /**
+   * @param query
+   * @param field
+   * @param tokenStream
+   * @param reader
+   * @throws IOException
+   */
+  private void init(Query query, String field, TokenStream tokenStream,
+    IndexReader reader) throws IOException {
+    CachedTokenStream cachedTokenStream = new CachedTokenStream(tokenStream);
+    MemoryIndex indexer = new MemoryIndex();
+
+    indexer.addField(field, cachedTokenStream);
+
+    this.memReader = indexer.createSearcher().getIndexReader();
+
+    QuerySpansExtractor qse = new QuerySpansExtractor();
+
+    if (reader == null) {
+      qse.getWeightedSpanTerms(query, field, this.memReader);
+    } else {
+      qse.getWeightedSpanTermsWithScores(query, field, this.memReader,
+        reader);
+    }
+
+    this.fieldWeightedSpanTerms = qse.getTerms();
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
+   */
+  public float getFragmentScore() {
+    return totalScore;
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token,
+   *      int)
+   */
+  public float getTokenScore(Token token, int position) {
+    String termText = token.termText();
+
+    WeightedSpanTerm weightedSpanTerm;
+
+    if ((weightedSpanTerm = (WeightedSpanTerm) fieldWeightedSpanTerms.get(
+            termText)) == null) {
+      return 0;
+    }
+
+    if (!weightedSpanTerm.checkPosition(position)) {
+      return 0;
+    }
+
+    float score = weightedSpanTerm.getWeight();
+
+    // found a query term - is it unique in this doc?
+    if (!foundTerms.contains(termText)) {
+      totalScore += score;
+      foundTerms.add(termText);
+    }
+
+    return score;
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
+   */
+  public void startFragment(TextFragment newFragment) {
+    foundTerms = new HashSet();
+    totalScore = 0;
+  }
+
+  /**
+   *
+   * @return The highest weighted term (useful for passing to
+   *         GradientFormatter to set top end of coloring scale).
+   */
+  public float getMaxTermWeight() {
+    return maxTermWeight;
+  }
+}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java	(revision 0)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTerm.java	(revision 0)
@@ -0,0 +1,78 @@
+package org.apache.lucene.search.highlight;
+
+import java.util.Iterator;
+import java.util.List;
+
+
+public class WeightedSpanTerm {
+  float weight; // multiplier
+  String term; //stemmed form
+  private List lowPos;
+  private List highPos;
+
+  public WeightedSpanTerm(float weight, String term) {
+    this.weight = weight;
+    this.term = term;
+  }
+
+  public boolean checkPosition(int position) {
+    Iterator hit = highPos.iterator();
+    Iterator lit = lowPos.iterator();
+
+    while (hit.hasNext()) {
+      Integer highInteger = (Integer) hit.next();
+      Integer lowInteger = (Integer) lit.next();
+
+      if (((position >= lowInteger.intValue()) &&
+          (position <= highInteger.intValue()))) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * @return the term value (stemmed)
+   */
+  public String getTerm() {
+    return term;
+  }
+
+  /**
+   * @return the weight associated with this term
+   */
+  public float getWeight() {
+    return weight;
+  }
+
+  /**
+   * @param term the term value (stemmed)
+   */
+  public void setTerm(String term) {
+    this.term = term;
+  }
+
+  /**
+   * @param weight the weight associated with this term
+   */
+  public void setWeight(float weight) {
+    this.weight = weight;
+  }
+
+  public List getHighPos() {
+    return highPos;
+  }
+
+  public void setHighPos(List highPos) {
+    this.highPos = highPos;
+  }
+
+  public List getLowPos() {
+    return lowPos;
+  }
+
+  public void setLowPos(List lowPos) {
+    this.lowPos = lowPos;
+  }
+}
Index: contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
===================================================================
--- contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java	(revision 510468)
+++ contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java	(working copy)
@@ -453,7 +453,7 @@
 	public void startFragment(TextFragment newFragment)
 	{
 	}
-	public float getTokenScore(Token token)
+	public float getTokenScore(Token token, int position)
 	{
 		return 0;
 	}
Index: contrib/highlighter/src/test/org/apache/lucene/search/highlight/SpanHighlighterTest.java
===================================================================
--- contrib/highlighter/src/test/org/apache/lucene/search/highlight/SpanHighlighterTest.java	(revision 0)
+++ contrib/highlighter/src/test/org/apache/lucene/search/highlight/SpanHighlighterTest.java	(revision 0)
@@ -0,0 +1,986 @@
+package org.apache.lucene.search.highlight;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.FilteredQuery;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.RangeFilter;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.highlight.CachedTokenStream;
+import org.apache.lucene.search.highlight.Formatter;
+import org.apache.lucene.search.highlight.Highlighter;
+import org.apache.lucene.search.highlight.NullFragmenter;
+import org.apache.lucene.search.highlight.SimpleFragmenter;
+import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
+import org.apache.lucene.search.highlight.SpanScorer;
+import org.apache.lucene.search.highlight.TextFragment;
+import org.apache.lucene.search.highlight.TokenGroup;
+import org.apache.lucene.search.highlight.WeightedSpanTerm;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanNotQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.RAMDirectory;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.StringTokenizer;
+
+
+public class SpanHighlighterTest extends TestCase implements Formatter {
+  private static final String FIELD_NAME = "contents";
+  private IndexReader reader;
+  private Query query;
+  RAMDirectory ramDir;
+  public Searcher searcher = null;
+  public Hits hits = null;
+  int numHighlights = 0;
+  Analyzer analyzer = new StandardAnalyzer();
+  String[] texts = {
+    "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
+    "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
+    "JFK has been shot", "John Kennedy has been shot",
+    "This text has a typo in referring to Keneddy"
+  };
+
+  protected void setUp() throws Exception {
+    ramDir = new RAMDirectory();
+
+    IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(),
+        true);
+
+    for (int i = 0; i < texts.length; i++) {
+      addDoc(writer, texts[i]);
+    }
+
+    writer.optimize();
+    writer.close();
+    reader = IndexReader.open(ramDir);
+    numHighlights = 0;
+  }
+
+  protected void tearDown() throws Exception {
+    super.tearDown();
+  }
+
+  public void testSimpleSpanHighlighter() throws Exception {
+    doSearching("Kennedy");
+
+    int maxNumFragmentsRequired = 2;
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+            FIELD_NAME, new StringReader(text)));
+      Highlighter highlighter = new Highlighter(new SpanScorer(query,
+            FIELD_NAME, tokenStream));
+      highlighter.setTextFragmenter(new SimpleFragmenter(40));
+      tokenStream.rewind();
+
+      String result = highlighter.getBestFragments(tokenStream, text,
+          maxNumFragmentsRequired, "...");
+      System.out.println("\t" + result);
+    }
+
+    //Not sure we can assert anything here - just running to check we don't throw any exceptions
+  }
+
+  public void testSimpleSpanPhraseHighlighting() throws Exception {
+    doSearching("\"very long and contains\"");
+
+    int maxNumFragmentsRequired = 2;
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+            FIELD_NAME, new StringReader(text)));
+      Highlighter highlighter = new Highlighter(new SpanScorer(query,
+            FIELD_NAME, tokenStream));
+      highlighter.setTextFragmenter(new SimpleFragmenter(40));
+      tokenStream.rewind();
+
+      String result = highlighter.getBestFragments(tokenStream, text,
+          maxNumFragmentsRequired, "...");
+      System.out.println("\t" + result);
+    }
+
+    //Not sure we can assert anything here - just running to check we don't throw any exceptions
+  }
+
+  public void testGetBestFragmentsSimpleQuery() throws Exception {
+    doSearching("Kennedy");
+    doStandardSpanHighlights();
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 4);
+  }
+
+  public void testNearSpanSimpleQuery() throws Exception {
+    doSearching(new SpanNearQuery(
+        new SpanQuery[] {
+          new SpanTermQuery(new Term(FIELD_NAME, "beginning")),
+          new SpanTermQuery(new Term(FIELD_NAME, "kennedy"))
+        }, 3, false));
+    doStandardSpanHighlights();
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 2);
+  }
+
+  public void testNotSpanSimpleQuery() throws Exception {
+    doSearching(new SpanNotQuery(new SpanNearQuery(
+          new SpanQuery[] {
+            new SpanTermQuery(new Term(FIELD_NAME, "shot")),
+            new SpanTermQuery(new Term(FIELD_NAME, "kennedy"))
+          }, 3, false), new SpanTermQuery(new Term(FIELD_NAME, "john"))));
+    doStandardSpanHighlights();
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 4);
+  }
+
+  public void testGetFuzzyFragments() throws Exception {
+    doSearching("Kinnedy~");
+    doStandardSpanHighlights();
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 5);
+  }
+
+  public void testGetWildCardFragments() throws Exception {
+    doSearching("K?nnedy");
+    doStandardSpanHighlights();
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 4);
+  }
+
+  public void testGetMidWildCardFragments() throws Exception {
+    doSearching("K*dy");
+    doStandardSpanHighlights();
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 5);
+  }
+
+  public void testGetRangeFragments() throws Exception {
+    doSearching(FIELD_NAME + ":[kannedy TO kznnedy]"); //bug? needs lower case
+    doStandardSpanHighlights();
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 5);
+  }
+
+  public void testGetBestFragmentsPhrase() throws Exception {
+    doSearching("\"John Kennedy\"");
+    doStandardSpanHighlights();
+    //Currently highlights "John" and "Kennedy" separately
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 2);
+  }
+
+  public void testGetBestFragmentsSpan() throws Exception {
+    SpanQuery[] clauses = {
+        new SpanTermQuery(new Term("contents", "john")),
+        new SpanTermQuery(new Term("contents", "kennedy")),
+      };
+
+    SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
+    doSearching(snq);
+    doStandardSpanHighlights();
+    //Currently highlights "John" and "Kennedy" separately
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 2);
+  }
+
+  public void testOffByOne() throws IOException {
+    String text = "help me [54-65]";
+    String field = "data";
+    TermQuery query = new TermQuery(new Term(field, "help"));
+    TokenStream tokenStream = new StandardAnalyzer().tokenStream(field,
+        new StringReader(text));
+    Highlighter hg = new Highlighter(new SimpleHTMLFormatter(),
+        new SpanScorer(query, field, tokenStream));
+    hg.setTextFragmenter(new NullFragmenter());
+
+    String match = null;
+    match = hg.getBestFragment(new StandardAnalyzer(), field, text);
+    assertEquals("<B>help</B> me [54-65]", match);
+  }
+
+  public void testGetBestFragmentsFilteredQuery() throws Exception {
+    RangeFilter rf = new RangeFilter("contents", "john", "john", true, true);
+    SpanQuery[] clauses = {
+        new SpanTermQuery(new Term("contents", "john")),
+        new SpanTermQuery(new Term("contents", "kennedy")),
+      };
+    SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
+    FilteredQuery fq = new FilteredQuery(snq, rf);
+
+    doSearching(fq);
+    doStandardSpanHighlights();
+    //Currently highlights "John" and "Kennedy" separately
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 2);
+  }
+
+  public void testGetBestFragmentsFilteredPhraseQuery()
+      throws Exception {
+    RangeFilter rf = new RangeFilter("contents", "john", "john", true, true);
+    PhraseQuery pq = new PhraseQuery();
+    pq.add(new Term("contents", "john"));
+    pq.add(new Term("contents", "kennedy"));
+
+    FilteredQuery fq = new FilteredQuery(pq, rf);
+
+    doSearching(fq);
+    doStandardSpanHighlights();
+    //Currently highlights "John" and "Kennedy" separately
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 2);
+  }
+
+  public void testGetBestFragmentsMultiTerm() throws Exception {
+    doSearching("John Kenn*");
+    doStandardSpanHighlights();
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 5);
+  }
+
+  public void testGetBestFragmentsWithOr() throws Exception {
+    doSearching("JFK OR Kennedy");
+    doStandardSpanHighlights();
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 5);
+  }
+
+  public void testGetBestSingleFragment() throws Exception {
+    doSearching("Kennedy");
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+
+      CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+            FIELD_NAME, new StringReader(text)));
+      Highlighter highlighter = new Highlighter(this,
+          new SpanScorer(query, FIELD_NAME, tokenStream));
+      tokenStream.rewind();
+      highlighter.setTextFragmenter(new SimpleFragmenter(40));
+
+      String result = highlighter.getBestFragment(tokenStream, text);
+      System.out.println("\t" + result);
+    }
+
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 4);
+
+    numHighlights = 0;
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+            FIELD_NAME, new StringReader(text)));
+      Highlighter highlighter = new Highlighter(this,
+          new SpanScorer(query, FIELD_NAME, tokenStream));
+      tokenStream.rewind();
+      highlighter.setTextFragmenter(new SimpleFragmenter(40));
+
+      highlighter.getBestFragment(analyzer, FIELD_NAME, text);
+    }
+
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 4);
+
+    numHighlights = 0;
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+            FIELD_NAME, new StringReader(text)));
+      Highlighter highlighter = new Highlighter(this,
+          new SpanScorer(query, FIELD_NAME, tokenStream));
+      tokenStream.rewind();
+      highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10);
+    }
+
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 4);
+  }
+
+  public void testGetBestSingleFragmentWithWeights()
+      throws Exception {
+    WeightedSpanTerm[] wTerms = new WeightedSpanTerm[2];
+    wTerms[0] = new WeightedSpanTerm(10f, "hello");
+
+    List lowPos = new ArrayList();
+    List highPos = new ArrayList();
+    lowPos.add(new Integer(0));
+    highPos.add(new Integer(0));
+    wTerms[0].setLowPos(lowPos);
+    wTerms[0].setHighPos(highPos);
+    wTerms[1] = new WeightedSpanTerm(1f, "kennedy");
+    lowPos = new ArrayList();
+    highPos = new ArrayList();
+    lowPos.add(new Integer(14));
+    highPos.add(new Integer(14));
+    wTerms[1].setLowPos(lowPos);
+    wTerms[1].setHighPos(highPos);
+
+    TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
+        new StringReader(texts[0]));
+    Highlighter highlighter = new Highlighter(new SpanScorer(wTerms));
+
+    highlighter.setTextFragmenter(new SimpleFragmenter(2));
+
+    String result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
+    assertTrue("Failed to find best section using weighted terms. Found: [" +
+        result + "]", "Hello".equals(result));
+
+    //readjust weights
+    wTerms[1].setWeight(50f);
+    tokenStream = analyzer.tokenStream(FIELD_NAME,
+        new StringReader(texts[0]));
+    highlighter = new Highlighter(new SpanScorer(wTerms));
+    highlighter.setTextFragmenter(new SimpleFragmenter(2));
+
+    result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
+    assertTrue("Failed to find best section using weighted terms. Found: " +
+        result, "kennedy".equals(result));
+  }
+
+  public void testGetBestSingleFragmentWithWeights2()
+      throws Exception {
+    doSearching("refers kennedy");
+
+    CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(FIELD_NAME,
+          new StringReader(texts[1])));
+    Highlighter highlighter = new Highlighter(new SpanScorer(query, FIELD_NAME,
+          tokenStream, reader));
+    tokenStream.rewind();
+    highlighter.setTextFragmenter(new SimpleFragmenter(2));
+
+    String result = highlighter.getBestFragment(tokenStream, texts[1]).trim();
+    assertTrue("Failed to find best section using weighted terms. Found: [" +
+        result + "]", "refers".equals(result));
+  }
+
+  // tests a "complex" analyzer that produces multiple
+  // overlapping tokens
+  public void testOverlapAnalyzer() throws Exception {
+    HashMap synonyms = new HashMap();
+    synonyms.put("football", "soccer,footie");
+
+    Analyzer analyzer = new SynonymAnalyzer(synonyms);
+    String srchkey = "football";
+
+    String s = "football-soccer in the euro 2004 footie competition";
+    QueryParser parser = new QueryParser("bookid", analyzer);
+    Query query = parser.parse(srchkey);
+
+    CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+          null, new StringReader(s)));
+
+    Highlighter highlighter = new Highlighter(this,
+        new SpanScorer(query, "bookid", tokenStream));
+    tokenStream.rewind();
+
+    // Get 3 best fragments and separate with a "..."
+    String result = highlighter.getBestFragments(tokenStream, s, 3, "...");
+    String expectedResult = "<B>football-soccer</B> in the euro 2004 <B>footie</B> competition";
+    assertEquals(expectedResult, result);
+  }
+
+  public void testGetSimpleHighlight() throws Exception {
+    doSearching("Kennedy");
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+            FIELD_NAME, new StringReader(text)));
+
+      Highlighter highlighter = new Highlighter(this,
+          new SpanScorer(query, FIELD_NAME, tokenStream));
+      tokenStream.rewind();
+
+      String result = highlighter.getBestFragment(tokenStream, text);
+      System.out.println("\t" + result);
+    }
+
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 4);
+  }
+
+  public void testGetTextFragments() throws Exception {
+    doSearching("Kennedy");
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+            FIELD_NAME, new StringReader(text)));
+
+      Highlighter highlighter = new Highlighter(this,
+          new SpanScorer(query, FIELD_NAME, tokenStream));
+      highlighter.setTextFragmenter(new SimpleFragmenter(20));
+
+      tokenStream.rewind();
+
+      String[] stringResults = highlighter.getBestFragments(tokenStream,
+          text, 10);
+      tokenStream.rewind();
+
+      //tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
+      TextFragment[] fragmentResults = highlighter.getBestTextFragments(tokenStream,
+          text, true, 10);
+
+      assertTrue("Failed to find correct number of text Fragments: " +
+          fragmentResults.length + " vs " + stringResults.length,
+          fragmentResults.length == stringResults.length);
+
+      for (int j = 0; j < stringResults.length; j++) {
+        System.out.println(fragmentResults[j]);
+        assertTrue("Failed to find same text Fragments: " +
+            fragmentResults[j] + " found",
+            fragmentResults[j].toString().equals(stringResults[j]));
+      }
+    }
+  }
+
+  public void testMaxSizeHighlight() throws Exception {
+    doSearching("meat");
+
+    CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+          FIELD_NAME, new StringReader(texts[0])));
+
+    Highlighter highlighter = new Highlighter(this,
+        new SpanScorer(query, FIELD_NAME, tokenStream));
+    highlighter.setMaxDocBytesToAnalyze(30);
+    tokenStream.rewind();
+    highlighter.getBestFragment(tokenStream, texts[0]);
+    assertTrue("Setting MaxDocBytesToAnalyze should have prevented " +
+        "us from finding matches for this record: " + numHighlights +
+        " found", numHighlights == 0);
+  }
+
+  public void testMaxSizeHighlightTruncates() throws IOException {
+    String goodWord = "goodtoken";
+    String[] stopWords = { "stoppedtoken" };
+
+    TermQuery query = new TermQuery(new Term("data", goodWord));
+    SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
+
+    String field = "data";
+
+    String match = null;
+    StringBuffer sb = new StringBuffer();
+    sb.append(goodWord);
+
+    for (int i = 0; i < 10000; i++) {
+      sb.append(" ");
+      sb.append(stopWords[0]);
+    }
+
+    TokenStream tokenStream = new StandardAnalyzer(stopWords).tokenStream(field,
+        new StringReader(sb.toString()));
+
+    Highlighter hg = new Highlighter(this,
+        new SpanScorer(query, field, tokenStream));
+    hg.setTextFragmenter(new NullFragmenter());
+
+    hg.setMaxDocBytesToAnalyze(100);
+    match = hg.getBestFragment(new StandardAnalyzer(stopWords), field,
+        sb.toString());
+    assertTrue("Matched text should be no more than 100 chars in length ",
+        match.length() < hg.getMaxDocBytesToAnalyze());
+
+    //add another tokenized word to the overall length - but set way beyond
+    //the length of text under consideration (after a large slug of stop words + whitespace)
+    sb.append(" ");
+    sb.append(goodWord);
+    match = hg.getBestFragment(new StandardAnalyzer(stopWords), "data",
+        sb.toString());
+    assertTrue("Matched text should be no more than 100 chars in length ",
+        match.length() < hg.getMaxDocBytesToAnalyze());
+  }
+
+  public void testUnRewrittenQuery() throws IOException, ParseException {
+    //test to show how rewritten query can still be used
+    searcher = new IndexSearcher(ramDir);
+
+    Analyzer analyzer = new StandardAnalyzer();
+
+    QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
+    Query query = parser.parse("JF? or Kenned*");
+    System.out.println("Searching with primitive query");
+
+    //forget to set this and...
+    //query=query.rewrite(reader);
+    Hits hits = searcher.search(query);
+
+    //create an instance of the highlighter with the tags used to surround highlighted text
+    // QueryHighlightExtractor highlighter = new QueryHighlightExtractor(this, query, new StandardAnalyzer());
+    int maxNumFragmentsRequired = 3;
+
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+            FIELD_NAME, new StringReader(text)));
+      Highlighter highlighter = new Highlighter(this,
+          new SpanScorer(query, FIELD_NAME, tokenStream));
+      highlighter.setTextFragmenter(new SimpleFragmenter(40));
+      tokenStream.rewind();
+
+      String highlightedText = highlighter.getBestFragments(tokenStream,
+          text, maxNumFragmentsRequired, "...");
+      System.out.println(highlightedText);
+    }
+
+    //We expect to have zero highlights if the query is multi-terms and is not rewritten!
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 0);
+  }
+
+  public void testNoFragments() throws Exception {
+    doSearching("AnInvalidQueryWhichShouldYieldNoResults");
+
+    for (int i = 0; i < texts.length; i++) {
+      String text = texts[i];
+      CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+            FIELD_NAME, new StringReader(text)));
+      Highlighter highlighter = new Highlighter(this,
+          new SpanScorer(query, FIELD_NAME, tokenStream));
+      tokenStream.rewind();
+
+      String result = highlighter.getBestFragment(tokenStream, text);
+      assertNull("The highlight result should be null for text with no query terms",
+          result);
+    }
+  }
+
+  public void testMultiSearcher() throws Exception {
+    //setup index 1
+    RAMDirectory ramDir1 = new RAMDirectory();
+    IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(),
+        true);
+    Document d = new Document();
+    Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES,
+        Field.Index.TOKENIZED);
+    d.add(f);
+    writer1.addDocument(d);
+    writer1.optimize();
+    writer1.close();
+
+    IndexReader reader1 = IndexReader.open(ramDir1);
+
+    //setup index 2
+    RAMDirectory ramDir2 = new RAMDirectory();
+    IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(),
+        true);
+    d = new Document();
+    f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES,
+        Field.Index.TOKENIZED);
+    d.add(f);
+    writer2.addDocument(d);
+    writer2.optimize();
+    writer2.close();
+
+    IndexReader reader2 = IndexReader.open(ramDir2);
+
+    IndexSearcher[] searchers = new IndexSearcher[2];
+    searchers[0] = new IndexSearcher(ramDir1);
+    searchers[1] = new IndexSearcher(ramDir2);
+
+    MultiSearcher multiSearcher = new MultiSearcher(searchers);
+    QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer());
+    query = parser.parse("multi*");
+    System.out.println("Searching for: " + query.toString(FIELD_NAME));
+    //at this point the multisearcher calls combine(query[])
+    hits = multiSearcher.search(query);
+
+    //query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer());
+    Query[] expandedQueries = new Query[2];
+    expandedQueries[0] = query.rewrite(reader1);
+    expandedQueries[1] = query.rewrite(reader2);
+    query = query.combine(expandedQueries);
+
+    //create an instance of the highlighter with the tags used to surround highlighted text
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+            FIELD_NAME, new StringReader(text)));
+      Highlighter highlighter = new Highlighter(this,
+          new SpanScorer(query, FIELD_NAME, tokenStream));
+      tokenStream.rewind();
+
+      String highlightedText = highlighter.getBestFragment(tokenStream,
+          text);
+      System.out.println(highlightedText);
+    }
+
+    assertTrue("Failed to find correct number of highlights " +
+        numHighlights + " found", numHighlights == 2);
+  }
+
+  public void testFieldSpecificHighlighting()
+      throws IOException, ParseException {
+    String docMainText = "fred is one of the people";
+    QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
+    Query query = parser.parse("fred category:people");
+
+    //highlighting respects fieldnames used in query
+    TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
+        new StringReader(docMainText));
+    SpanScorer fieldSpecificScorer = new SpanScorer(query, FIELD_NAME,
+        tokenStream);
+    Highlighter fieldSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(),
+        fieldSpecificScorer);
+    fieldSpecificHighlighter.setTextFragmenter(new NullFragmenter());
+
+    String result = fieldSpecificHighlighter.getBestFragment(analyzer,
+        "contents", docMainText);
+    assertEquals("Should match", result, "<B>fred</B> is one of the people");
+
+    //highlighting does not respect fieldnames used in query
+    tokenStream = analyzer.tokenStream(FIELD_NAME,
+        new StringReader(docMainText));
+
+    SpanScorer fieldInSpecificScorer = new SpanScorer(query, "contents",
+        tokenStream);
+    Highlighter fieldInSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(),
+        fieldInSpecificScorer);
+    fieldInSpecificHighlighter.setTextFragmenter(new NullFragmenter());
+    result = fieldInSpecificHighlighter.getBestFragment(analyzer,
+        FIELD_NAME, docMainText);
+    assertEquals("Should match", result,
+        "<B>fred</B> is one of the <B>people</B>");
+
+    reader.close();
+  }
+
+  protected TokenStream getTS2() {
+    //String s = "Hi-Speed10 foo";
+    return new TokenStream() {
+        Iterator iter;
+        List lst;
+
+        {
+          lst = new ArrayList();
+
+          Token t;
+          t = new Token("hi", 0, 2);
+          lst.add(t);
+          t = new Token("hispeed", 0, 8);
+          lst.add(t);
+          t = new Token("speed", 3, 8);
+          t.setPositionIncrement(0);
+          lst.add(t);
+          t = new Token("10", 8, 10);
+          lst.add(t);
+          t = new Token("foo", 11, 14);
+          lst.add(t);
+          iter = lst.iterator();
+        }
+
+        public Token next() throws IOException {
+          return iter.hasNext() ? (Token) iter.next() : null;
+        }
+      };
+  }
+
+  // same token-stream as above, but the bigger token comes first this time
+  protected TokenStream getTS2a() {
+    //String s = "Hi-Speed10 foo";
+    return new TokenStream() {
+        Iterator iter;
+        List lst;
+
+        {
+          lst = new ArrayList();
+
+          Token t;
+          t = new Token("hispeed", 0, 8);
+          lst.add(t);
+          t = new Token("hi", 0, 2);
+          t.setPositionIncrement(0);
+          lst.add(t);
+          t = new Token("speed", 3, 8);
+          lst.add(t);
+          t = new Token("10", 8, 10);
+          lst.add(t);
+          t = new Token("foo", 11, 14);
+          lst.add(t);
+          iter = lst.iterator();
+        }
+
+        public Token next() throws IOException {
+          return iter.hasNext() ? (Token) iter.next() : null;
+        }
+      };
+  }
+
+  public void testOverlapAnalyzer2() throws Exception {
+    String s = "Hi-Speed10 foo";
+
+    Query query;
+    Highlighter highlighter;
+    String result;
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("foo");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2()));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("Hi-Speed10 <B>foo</B>", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("10");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2()));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("Hi-Speed<B>10</B> foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2()));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("<B>Hi</B>-Speed10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("speed");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2()));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("Hi-<B>Speed</B>10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse(
+        "hispeed");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2()));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("<B>Hi-Speed</B>10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse(
+        "hi speed");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2()));
+    result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+    assertEquals("<B>Hi-Speed</B>10 foo", result);
+
+    /////////////////// same tests, just put the bigger overlapping token first
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("foo");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2a()));
+    result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+    assertEquals("Hi-Speed10 <B>foo</B>", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("10");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2a()));
+    result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+    assertEquals("Hi-Speed<B>10</B> foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("hi");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2a()));
+    result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+    assertEquals("<B>Hi</B>-Speed10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse("speed");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2a()));
+    result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+    assertEquals("Hi-<B>Speed</B>10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse(
+        "hispeed");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2a()));
+    result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+    assertEquals("<B>Hi-Speed</B>10 foo", result);
+
+    query = new QueryParser("text", new WhitespaceAnalyzer()).parse(
+        "hi speed");
+    highlighter = new Highlighter(this,
+        new SpanScorer(query, "text", getTS2a()));
+    result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+    assertEquals("<B>Hi-Speed</B>10 foo", result);
+  }
+
+  /*
+
+  public void testBigramAnalyzer() throws IOException, ParseException
+  {
+    //test to ensure analyzers with non-consecutive start/end offsets
+    //don't double-highlight text
+    //setup index 1
+    RAMDirectory ramDir = new RAMDirectory();
+    Analyzer bigramAnalyzer=new CJKAnalyzer();
+    IndexWriter writer = new IndexWriter(ramDir,bigramAnalyzer , true);
+    Document d = new Document();
+    Field f = new Field(FIELD_NAME, "java abc def", true, true, true);
+    d.add(f);
+    writer.addDocument(d);
+    writer.close();
+    IndexReader reader = IndexReader.open(ramDir);
+
+    IndexSearcher searcher=new IndexSearcher(reader);
+    query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);
+    System.out.println("Searching for: " + query.toString(FIELD_NAME));
+    hits = searcher.search(query);
+
+    Highlighter highlighter =
+      new Highlighter(this,new QueryFragmentScorer(query));
+
+    for (int i = 0; i < hits.length(); i++)
+    {
+      String text = hits.doc(i).get(FIELD_NAME);
+      TokenStream tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text));
+      String highlightedText = highlighter.getBestFragment(tokenStream,text);
+      System.out.println(highlightedText);
+    }
+
+  }
+  */
+  public void doSearching(String queryString) throws Exception {
+    QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer());
+    query = parser.parse(queryString);
+    doSearching(query);
+  }
+
+  public void doSearching(Query unReWrittenQuery) throws Exception {
+    searcher = new IndexSearcher(ramDir);
+    //for any multi-term queries to work (prefix, wildcard, range, fuzzy etc) you must use a rewritten query!
+    query = unReWrittenQuery.rewrite(reader);
+    System.out.println("Searching for: " + query.toString(FIELD_NAME));
+    hits = searcher.search(query);
+  }
+
+  void doStandardSpanHighlights() throws Exception {
+    for (int i = 0; i < hits.length(); i++) {
+      String text = hits.doc(i).get(FIELD_NAME);
+      int maxNumFragmentsRequired = 2;
+      String fragmentSeparator = "...";
+      CachedTokenStream tokenStream = new CachedTokenStream(analyzer.tokenStream(
+            FIELD_NAME, new StringReader(text)));
+      Highlighter highlighter = new Highlighter(this,
+          new SpanScorer(query, FIELD_NAME, tokenStream));
+      tokenStream.rewind();
+      highlighter.setTextFragmenter(new SimpleFragmenter(20));
+
+      String result = highlighter.getBestFragments(tokenStream, text,
+          maxNumFragmentsRequired, fragmentSeparator);
+      System.out.println("\t" + result);
+    }
+  }
+
+  private void addDoc(IndexWriter writer, String text)
+      throws IOException {
+    Document d = new Document();
+    Field f = new Field(FIELD_NAME, text, Field.Store.YES,
+        Field.Index.TOKENIZED);
+    d.add(f);
+    writer.addDocument(d);
+  }
+
+  public String highlightTerm(String originalText, TokenGroup group) {
+    if (group.getTotalScore() <= 0) {
+      return originalText;
+    }
+
+    numHighlights++; //update stats used in assertions
+
+    return "<B>" + originalText + "</B>";
+  }
+
+  // ===================================================================
+  // ========== BEGIN TEST SUPPORTING CLASSES
+  // ========== THESE LOOK LIKE, WITH SOME MORE EFFORT THESE COULD BE
+  // ========== MADE MORE GENERALLY USEFUL.
+  // TODO - make synonyms all interchangeable with each other and produce
+  // a version that does hyponyms - the "is a specialised type of ...."
+  // so that car = audi, bmw and volkswagen but bmw != audi so different
+  // behaviour to synonyms
+  // ===================================================================
+  class SynonymAnalyzer extends Analyzer {
+    private Map synonyms;
+
+    public SynonymAnalyzer(Map synonyms) {
+      this.synonyms = synonyms;
+    }
+
+    /* (non-Javadoc)
+     * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
+     */
+    public TokenStream tokenStream(String arg0, Reader arg1) {
+      return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms);
+    }
+  }
+
+  /**
+   * Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer)
+   * @author MAHarwood
+   */
+  class SynonymTokenizer extends TokenStream {
+    private TokenStream realStream;
+    private Token currentRealToken = null;
+    private Map synonyms;
+    StringTokenizer st = null;
+
+    public SynonymTokenizer(TokenStream realStream, Map synonyms) {
+      this.realStream = realStream;
+      this.synonyms = synonyms;
+    }
+
+    public Token next() throws IOException {
+      if (currentRealToken == null) {
+        Token nextRealToken = realStream.next();
+
+        if (nextRealToken == null) {
+          return null;
+        }
+
+        String expansions = (String) synonyms.get(nextRealToken.termText());
+
+        if (expansions == null) {
+          return nextRealToken;
+        }
+
+        st = new StringTokenizer(expansions, ",");
+
+        if (st.hasMoreTokens()) {
+          currentRealToken = nextRealToken;
+        }
+
+        return currentRealToken;
+      } else {
+        String nextExpandedValue = st.nextToken();
+        Token expandedToken = new Token(nextExpandedValue,
+            currentRealToken.startOffset(),
+            currentRealToken.endOffset());
+        expandedToken.setPositionIncrement(0);
+
+        if (!st.hasMoreTokens()) {
+          currentRealToken = null;
+          st = null;
+        }
+
+        return expandedToken;
+      }
+    }
+  }
+}