Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1441376) +++ lucene/CHANGES.txt (working copy) @@ -87,6 +87,9 @@ * LUCENE-4723: Add AnalyzerFactoryTask to benchmark, and enable analyzer creation via the resulting factories using NewAnalyzerTask. (Steve Rowe) +* LUCENE-4728: Add support for highlighting CommonTermsQuery to all highlighter + implementations. (Simon Willnauer) + API Changes * LUCENE-4709: FacetResultNode no longer has a residue field. (Shai Erera) Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java =================================================================== --- lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 1441376) +++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (working copy) @@ -46,6 +46,7 @@ import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; +import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.*; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner; @@ -114,6 +115,87 @@ } } + public void testHighlightingCommonTermsQuery() throws Exception { + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3); + query.add(new Term(FIELD_NAME, "this")); + query.add(new Term(FIELD_NAME, "long")); + query.add(new Term(FIELD_NAME, "very")); + + searcher = new IndexSearcher(reader); + TopDocs hits = searcher.search(query, 10); + assertEquals(2, hits.totalHits); + QueryScorer scorer = new QueryScorer(query, FIELD_NAME); + Highlighter highlighter = new Highlighter(scorer); + + StoredDocument doc = searcher.doc(hits.scoreDocs[0].doc); + String storedField = 
doc.get(FIELD_NAME); + + TokenStream stream = TokenSources.getAnyTokenStream(searcher + .getIndexReader(), hits.scoreDocs[0].doc, FIELD_NAME, doc, analyzer); + Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); + highlighter.setTextFragmenter(fragmenter); + String fragment = highlighter.getBestFragment(stream, storedField); + assertEquals("Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot", fragment); + + doc = searcher.doc(hits.scoreDocs[1].doc); + storedField = doc.get(FIELD_NAME); + + stream = TokenSources.getAnyTokenStream(searcher + .getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer); + highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer)); + fragment = highlighter.getBestFragment(stream, storedField); + assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very", fragment); + } + + public void testHighlightUnknowQueryAfterRewrite() throws IOException, InvalidTokenOffsetsException { + Query query = new Query() { + + @Override + public Query rewrite(IndexReader reader) throws IOException { + CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3); + query.add(new Term(FIELD_NAME, "this")); + query.add(new Term(FIELD_NAME, "long")); + query.add(new Term(FIELD_NAME, "very")); + return query; + } + + @Override + public String toString(String field) { + return null; + } + + }; + + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); + + searcher = new IndexSearcher(reader); + TopDocs hits = searcher.search(query, 10); + assertEquals(2, hits.totalHits); + QueryScorer scorer = new QueryScorer(query, FIELD_NAME); + Highlighter highlighter = new Highlighter(scorer); + + StoredDocument doc = searcher.doc(hits.scoreDocs[0].doc); + String storedField = doc.get(FIELD_NAME); + + TokenStream stream = TokenSources.getAnyTokenStream(searcher + 
.getIndexReader(), hits.scoreDocs[0].doc, FIELD_NAME, doc, analyzer); + Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); + highlighter.setTextFragmenter(fragmenter); + String fragment = highlighter.getBestFragment(stream, storedField); + assertEquals("Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot", fragment); + + doc = searcher.doc(hits.scoreDocs[1].doc); + storedField = doc.get(FIELD_NAME); + + stream = TokenSources.getAnyTokenStream(searcher + .getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer); + highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer)); + fragment = highlighter.getBestFragment(stream, storedField); + assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very", fragment); + + } + public void testHighlightingWithDefaultField() throws Exception { String s1 = "I call our world Flatland, not because we call it so,"; @@ -150,7 +232,7 @@ "Query in a named field does not result in highlighting when that field isn't in the query", s1, highlightField(q, FIELD_NAME, s1)); } - + /** * This method intended for use with testHighlightingWithDefaultField() */ @@ -603,7 +685,7 @@ // Not sure we can assert anything here - just running to check we dont // throw any exceptions } - + public void testSpanHighlighting() throws Exception { Query query1 = new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(FIELD_NAME, "wordx")), Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java =================================================================== --- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (revision 1441376) +++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (working copy) @@ -18,6 +18,8 @@ import 
java.io.IOException; import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; @@ -26,7 +28,13 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.queries.CommonTermsQuery; +import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.highlight.SimpleSpanFragmenter; +import org.apache.lucene.search.highlight.TokenSources; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -62,4 +70,47 @@ writer.close(); dir.close(); } + + public void testCommonTermsQueryHighlightTest() throws IOException { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))); + FieldType type = new FieldType(TextField.TYPE_STORED); + type.setStoreTermVectorOffsets(true); + type.setStoreTermVectorPositions(true); + type.setStoreTermVectors(true); + type.freeze(); + String[] texts = { + "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot", + "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy", + "JFK has been shot", "John Kennedy has been shot", + "This text has a typo in referring to Keneddy", + "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" 
}; + for (int i = 0; i < texts.length; i++) { + Document doc = new Document(); + Field field = new Field("field", texts[i], type); + doc.add(field); + writer.addDocument(doc); + } + CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 2); + query.add(new Term("field", "text")); + query.add(new Term("field", "long")); + query.add(new Term("field", "very")); + + FastVectorHighlighter highlighter = new FastVectorHighlighter(); + IndexReader reader = DirectoryReader.open(writer, true); + IndexSearcher searcher = new IndexSearcher(reader); + TopDocs hits = searcher.search(query, 10); + assertEquals(2, hits.totalHits); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[0].doc, "field", 1000, 1); + assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy", bestFragments[0]); + + fieldQuery = highlighter.getFieldQuery(query, reader); + bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[1].doc, "field", 1000, 1); + assertEquals("Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot", bestFragments[0]); + + reader.close(); + writer.close(); + dir.close(); + } } Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java =================================================================== --- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java (revision 1441376) +++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java (working copy) @@ -23,8 +23,13 @@ import java.util.Map; import java.util.Set; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.Term; import 
org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.FilteredQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; @@ -35,6 +40,7 @@ import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; public class FieldQueryTest extends AbstractTestCase { @@ -905,4 +911,40 @@ assertNotNull (fq.searchPhrase(F, phraseCandidate)); } + public void testStopRewrite() throws Exception { + Query q = new Query() { + + @Override + public String toString(String field) { + return "DummyQuery"; + } + + }; + make1d1fIndex( "a" ); + assertNotNull(reader); + new FieldQuery(q, reader, true, true ); + } + + public void testFlattenFilteredQuery() throws Exception { + Query query = new FilteredQuery(pqF( "A" ), new Filter() { + @Override + public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) + throws IOException { + return null; + } + }); + FieldQuery fq = new FieldQuery( query, true, true ); + Set flatQueries = new HashSet(); + fq.flatten( query, reader, flatQueries ); + assertCollectionQueries( flatQueries, tq( "A" ) ); + } + + public void testFlattenConstantScoreQuery() throws Exception { + Query query = new ConstantScoreQuery(pqF( "A" )); + FieldQuery fq = new FieldQuery( query, true, true ); + Set flatQueries = new HashSet(); + fq.flatten( query, reader, flatQueries ); + assertCollectionQueries( flatQueries, tq( "A" ) ); + } + } Index: lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java =================================================================== --- 
lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 1441376) +++ lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (working copy) @@ -18,7 +18,6 @@ */ import java.io.IOException; import java.util.ArrayList; -import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -29,11 +28,17 @@ import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FilterAtomicReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.Terms; import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.*; import org.apache.lucene.search.spans.FieldMaskingSpanQuery; import org.apache.lucene.search.spans.SpanFirstQuery; @@ -45,6 +50,7 @@ import org.apache.lucene.search.spans.Spans; import org.apache.lucene.util.Bits; + /** * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether * {@link Term}s from the {@link Query} are contained in a supplied {@link TokenStream}. 
@@ -53,13 +59,14 @@ private String fieldName; private TokenStream tokenStream; - private Map readers = new HashMap(10); private String defaultField; private boolean expandMultiTermQuery; private boolean cachedTokenStream; private boolean wrapToCaching = true; private int maxDocCharsToAnalyze; + private AtomicReader reader = null; + public WeightedSpanTermExtractor() { } @@ -69,14 +76,12 @@ } } - private void closeReaders() { - Collection ctxSet = readers.values(); - - for (final AtomicReaderContext ctx : ctxSet) { + private void closeReader() { + if (this.reader != null) { try { - ctx.reader().close(); + this.reader.close(); } catch (IOException e) { - // alert? + // alert? } } } @@ -146,21 +151,12 @@ if (q != null) { extract(q, terms); } + } else if (query instanceof CommonTermsQuery) { + extractWeightedTerms(terms, query); } else if (query instanceof DisjunctionMaxQuery) { for (Iterator iterator = ((DisjunctionMaxQuery) query).iterator(); iterator.hasNext();) { extract(iterator.next(), terms); } - } else if (query instanceof MultiTermQuery && expandMultiTermQuery) { - MultiTermQuery mtq = ((MultiTermQuery)query); - if(mtq.getRewriteMethod() != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) { - mtq = (MultiTermQuery) mtq.clone(); - mtq.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); - query = mtq; - } - if (mtq.getField() != null) { - IndexReader ir = getLeafContextForField(mtq.getField()).reader(); - extract(query.rewrite(ir), terms); - } } else if (query instanceof MultiPhraseQuery) { final MultiPhraseQuery mpq = (MultiPhraseQuery) query; final List termArrays = mpq.getTermArrays(); @@ -210,12 +206,30 @@ sp.setBoost(query.getBoost()); extractWeightedSpanTerms(terms, sp); } + } else { + Query origQuery = query; + if (query instanceof MultiTermQuery) { + if (!expandMultiTermQuery) { + return; + } + MultiTermQuery copy = (MultiTermQuery) query.clone(); + copy.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); + origQuery = copy; + } + 
final IndexReader reader = getLeafContext().reader(); + Query rewritten = origQuery.rewrite(reader); + if (rewritten != origQuery) { + // only rewrite once and then flatten again - the rewritten query could have a special treatment + // if this method is overridden in a subclass or above in the next recursion + extract(rewritten, terms); + } } extractUnknownQuery(query, terms); } protected void extractUnknownQuery(Query query, Map terms) throws IOException { + // for sub-classing to extract custom queries } @@ -249,7 +263,7 @@ final boolean mustRewriteQuery = mustRewriteQuery(spanQuery); if (mustRewriteQuery) { for (final String field : fieldNames) { - final SpanQuery rewrittenQuery = (SpanQuery) spanQuery.rewrite(getLeafContextForField(field).reader()); + final SpanQuery rewrittenQuery = (SpanQuery) spanQuery.rewrite(getLeafContext().reader()); queries.put(field, rewrittenQuery); rewrittenQuery.extractTerms(nonWeightedTerms); } @@ -266,7 +280,7 @@ } else { q = spanQuery; } - AtomicReaderContext context = getLeafContextForField(field); + AtomicReaderContext context = getLeafContext(); Map termContexts = new HashMap(); TreeSet extractedTerms = new TreeSet(); q.extractTerms(extractedTerms); @@ -338,25 +352,57 @@ return rv; } - protected AtomicReaderContext getLeafContextForField(String field) throws IOException { + protected AtomicReaderContext getLeafContext() throws IOException { if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) { tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze)); cachedTokenStream = true; } - AtomicReaderContext context = readers.get(field); - if (context == null) { + if (reader == null) { MemoryIndex indexer = new MemoryIndex(); - indexer.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze)); + indexer.addField("_some_field", new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze)); tokenStream.reset(); IndexSearcher searcher = 
indexer.createSearcher(); // MEM index has only atomic ctx - context = (AtomicReaderContext) searcher.getTopReaderContext(); - readers.put(field, context); + reader = new DelegatingAtomicReader(((AtomicReaderContext)searcher.getTopReaderContext()).reader(), "_some_field"); } - return context; + return reader.getContext(); } + + + /* + * This reader will just delegate every call to a single field in the wrapped AtomicReader. + * This way we only need to build this field once rather than N-Times + */ + private static final class DelegatingAtomicReader extends FilterAtomicReader { + private final String field; + public DelegatingAtomicReader(AtomicReader in, String field) { + super(in); + this.field = field; + } + + @Override + public Fields fields() throws IOException { + return new FilterFields(super.fields()) { + @Override + public Terms terms(String field) throws IOException { + return super.terms(DelegatingAtomicReader.this.field); + } + }; + } + + @Override + public DocValues docValues(String field) throws IOException { + return super.docValues(this.field); + } + + @Override + public DocValues normValues(String field) throws IOException { + return super.normValues(this.field); + } + } + /** * Creates a Map of WeightedSpanTerms from the given Query and TokenStream. 
* @@ -401,7 +447,7 @@ try { extract(query, terms); } finally { - closeReaders(); + closeReader(); } return terms; @@ -450,7 +496,7 @@ } } finally { - closeReaders(); + closeReader(); } return terms; Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java =================================================================== --- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (revision 1441376) +++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (working copy) @@ -28,9 +28,12 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.queries.CommonTermsQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreQuery; import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.FilteredQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; @@ -92,8 +95,7 @@ if( !clause.isProhibited() ) flatten( clause.getQuery(), reader, flatQueries ); } - } - else if( sourceQuery instanceof DisjunctionMaxQuery ){ + } else if( sourceQuery instanceof DisjunctionMaxQuery ){ DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery; for( Query query : dmq ){ flatten( query, reader, flatQueries ); @@ -103,12 +105,6 @@ if( !flatQueries.contains( sourceQuery ) ) flatQueries.add( sourceQuery ); } - else if (sourceQuery instanceof MultiTermQuery && reader != null) { - MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone(); - copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS)); - BooleanQuery mtqTerms = (BooleanQuery) copy.rewrite(reader); - flatten(mtqTerms, reader, flatQueries); - } else if( sourceQuery instanceof PhraseQuery ){ if( !flatQueries.contains( sourceQuery ) ){ PhraseQuery pq = 
(PhraseQuery)sourceQuery; @@ -118,6 +114,31 @@ flatQueries.add( new TermQuery( pq.getTerms()[0] ) ); } } + } else if (sourceQuery instanceof ConstantScoreQuery) { + final Query q = ((ConstantScoreQuery) sourceQuery).getQuery(); + if (q != null) { + flatten(q, reader, flatQueries); + } + } else if (sourceQuery instanceof FilteredQuery) { + final Query q = ((FilteredQuery) sourceQuery).getQuery(); + if (q != null) { + flatten(q, reader, flatQueries); + } + } else if (reader != null){ + Query query = sourceQuery; + if (sourceQuery instanceof MultiTermQuery) { + MultiTermQuery copy = (MultiTermQuery) sourceQuery.clone(); + copy.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS)); + query = copy; + } + Query rewritten = query.rewrite(reader); + if (rewritten != query) { + // only rewrite once and then flatten again - the rewritten query could have a special treatment + // if this method is overridden in a subclass. + flatten(rewritten, reader, flatQueries); + + } + // if the query is already rewritten we discard it } // else discard queries } Index: lucene/highlighter/build.xml =================================================================== --- lucene/highlighter/build.xml (revision 1441376) +++ lucene/highlighter/build.xml (working copy) @@ -27,6 +27,7 @@ + Index: dev-tools/maven/lucene/highlighter/pom.xml.template =================================================================== --- dev-tools/maven/lucene/highlighter/pom.xml.template (revision 1441376) +++ dev-tools/maven/lucene/highlighter/pom.xml.template (working copy) @@ -61,6 +61,11 @@ lucene-memory ${project.version} + + ${project.groupId} + lucene-queries + ${project.version} + ${module-path}/src/java