Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java (revision 1743675) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java (working copy) @@ -24,6 +24,7 @@ import java.util.Deque; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Set; import java.util.concurrent.atomic.AtomicReference; @@ -369,10 +370,20 @@ PropertyRestriction restriction = filter.getPropertyRestriction(QueryImpl.REP_EXCERPT); boolean addExcerpt = restriction != null && restriction.isNotNullRestriction(); + + Analyzer analyzer = indexNode.getDefinition().getAnalyzer(); + + if (addExcerpt) { + // setup highlighter + QueryScorer scorer = new QueryScorer(query); + scorer.setExpandMultiTermQuery(true); + highlighter.setFragmentScorer(scorer); + } + for (ScoreDoc doc : docs.scoreDocs) { String excerpt = null; if (addExcerpt) { - excerpt = getExcerpt(indexNode, searcher, query, doc); + excerpt = getExcerpt(analyzer, searcher, doc); } LuceneResultRow row = convertToRow(doc, searcher, excerpt); @@ -487,20 +498,17 @@ return new LucenePathCursor(itr, settings, sizeEstimator); } - private String getExcerpt(IndexNode indexNode, IndexSearcher searcher, Query query, ScoreDoc doc) throws IOException { + private String getExcerpt(Analyzer analyzer, IndexSearcher searcher, ScoreDoc doc) throws IOException { StringBuilder excerpt = new StringBuilder(); - QueryScorer scorer = new QueryScorer(query); - scorer.setExpandMultiTermQuery(true); - highlighter.setFragmentScorer(scorer); - Analyzer analyzer = indexNode.getDefinition().getAnalyzer(); - for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) - if (!FieldNames.SUGGEST.equals(field.name())) { + for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) { + String name = field.name(); + // only full text or analyzed fields + if (name.startsWith(FieldNames.FULLTEXT) || name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) { + String text = field.stringValue(); + TokenStream tokenStream = analyzer.tokenStream(name, text); try { - TokenStream tokenStream = analyzer.tokenStream(field.name(), field.stringValue()); - tokenStream.reset(); - CachingTokenFilter cachingTokenFilter = new CachingTokenFilter(tokenStream); - TextFragment[] textFragments = highlighter.getBestTextFragments(cachingTokenFilter, field.stringValue(), true, 2); + TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, true, 1); if (textFragments != null && textFragments.length > 0) { for (TextFragment fragment : textFragments) { if (excerpt.length() > 0) { @@ -508,11 +516,13 @@ } excerpt.append(fragment.toString()); } + break; } } catch (InvalidTokenOffsetsException e) { LOG.error("higlighting failed", e); } } + } return excerpt.toString(); } Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (revision 1743675) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (working copy) @@ -24,14 +24,10 @@ import javax.jcr.PropertyType; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.Deque; -import java.util.HashMap; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; -import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicReference; @@ -75,24 +71,16 @@ import org.apache.jackrabbit.oak.spi.state.NodeState; import org.apache.jackrabbit.oak.util.PerfLogger; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; -import org.apache.lucene.facet.FacetsCollector; -import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.LabelAndValue; -import org.apache.lucene.facet.MultiFacets; -import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState; -import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts; -import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Term; import org.apache.lucene.queries.CustomScoreQuery; @@ -125,14 +113,12 @@ import org.apache.lucene.search.highlight.TextFragment; import org.apache.lucene.search.spell.SuggestWord; import org.apache.lucene.search.suggest.Lookup; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Preconditions.checkState; -import static com.google.common.collect.Lists.newArrayList; import static com.google.common.collect.Lists.newArrayListWithCapacity; import static org.apache.jackrabbit.JcrConstants.JCR_MIXINTYPES; import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE; @@ -143,6 +129,7 @@ import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames.PATH; import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames.SUGGEST; import static org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.NATIVE_SORT_ORDER; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.FULL_TEXT_ENABLED; import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.VERSION; import static org.apache.jackrabbit.oak.plugins.index.lucene.TermFactory.newAncestorTerm; import static org.apache.jackrabbit.oak.plugins.index.lucene.TermFactory.newPathTerm; @@ -421,10 +408,19 @@ restriction = filter.getPropertyRestriction(QueryImpl.OAK_SCORE_EXPLANATION); boolean addExplain = restriction != null && restriction.isNotNullRestriction(); + Analyzer analyzer = indexNode.getDefinition().getAnalyzer(); + + if (addExcerpt) { + // setup highlighter + QueryScorer scorer = new QueryScorer(query); + scorer.setExpandMultiTermQuery(true); + highlighter.setFragmentScorer(scorer); + } + for (ScoreDoc doc : docs.scoreDocs) { String excerpt = null; if (addExcerpt) { - excerpt = getExcerpt(indexNode, searcher, query, doc); + excerpt = getExcerpt(analyzer, searcher, doc); } String explanation = null; @@ -576,20 +572,17 @@ return query; } - private String getExcerpt(IndexNode indexNode, IndexSearcher searcher, Query query, ScoreDoc doc) throws IOException { + private String getExcerpt(Analyzer analyzer, IndexSearcher searcher, ScoreDoc doc) throws IOException { StringBuilder excerpt = new StringBuilder(); - QueryScorer scorer = new QueryScorer(query); - scorer.setExpandMultiTermQuery(true); - highlighter.setFragmentScorer(scorer); - Analyzer analyzer = indexNode.getDefinition().getAnalyzer(); - for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) - if (!SUGGEST.equals(field.name())) { + for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) { + String name = field.name(); + // only full text or analyzed fields + if (name.startsWith(FieldNames.FULLTEXT) || name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) { + String text = field.stringValue(); + TokenStream tokenStream = analyzer.tokenStream(name, text); try { - TokenStream tokenStream = analyzer.tokenStream(field.name(), field.stringValue()); - tokenStream.reset(); - CachingTokenFilter cachingTokenFilter = new CachingTokenFilter(tokenStream); - TextFragment[] textFragments = highlighter.getBestTextFragments(cachingTokenFilter, field.stringValue(), true, 2); + TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, true, 1); if (textFragments != null && textFragments.length > 0) { for (TextFragment fragment : textFragments) { if (excerpt.length() > 0) { @@ -597,11 +590,13 @@ } excerpt.append(fragment.toString()); } + break; } } catch (InvalidTokenOffsetsException e) { LOG.error("higlighting failed", e); } } + } return excerpt.toString(); } Index: oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java =================================================================== --- oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (revision 1743675) +++ oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (working copy) @@ -19,11 +19,14 @@ package org.apache.jackrabbit.oak.plugins.index.lucene; +import javax.annotation.Nonnull; +import javax.jcr.PropertyType; +import java.io.IOException; import java.io.InputStream; -import java.io.IOException; import java.text.ParseException; import java.util.Calendar; import java.util.Collections; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Random; @@ -31,9 +34,6 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import javax.annotation.Nonnull; -import javax.jcr.PropertyType; - import com.google.common.base.Charsets; import com.google.common.collect.ComparisonChain; import com.google.common.collect.ImmutableList; @@ -81,41 +81,23 @@ import static com.google.common.collect.ImmutableSet.of; import static com.google.common.collect.Lists.newArrayList; import static java.util.Arrays.asList; -import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT; -import static org.apache.jackrabbit.JcrConstants.JCR_DATA; -import static org.apache.jackrabbit.JcrConstants.NT_FILE; +import static org.apache.jackrabbit.JcrConstants.*; import static org.apache.jackrabbit.oak.api.QueryEngine.NO_BINDINGS; import static org.apache.jackrabbit.oak.api.QueryEngine.NO_MAPPINGS; import static org.apache.jackrabbit.oak.api.Type.NAMES; import static org.apache.jackrabbit.oak.api.Type.STRINGS; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.DECLARING_NODE_TYPES; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NAME; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NODE_TYPE; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.REINDEX_PROPERTY_NAME; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME; +import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.*; import static org.apache.jackrabbit.oak.plugins.index.PathFilter.PROP_EXCLUDED_PATHS; import static org.apache.jackrabbit.oak.plugins.index.PathFilter.PROP_INCLUDED_PATHS; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INCLUDE_PROPERTY_NAMES; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ORDERED_PROP_NAMES; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROPDEF_PROP_NODE_NAME; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NAME; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_PROPERTY_INDEX; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_TYPE; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.TIKA; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.*; import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorTest.createCal; import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.newNodeAggregator; import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.useV2; import static org.apache.jackrabbit.oak.plugins.index.property.OrderedIndex.OrderDirection; import static org.apache.jackrabbit.oak.plugins.memory.PropertyStates.createProperty; +import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.CoreMatchers.not; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertThat; -import static org.junit.Assert.assertTrue; -import static org.hamcrest.CoreMatchers.containsString; +import static org.junit.Assert.*; public class LucenePropertyIndexTest extends AbstractQueryTest { /** @@ -659,7 +641,7 @@ t.setProperty(JcrConstants.JCR_PRIMARYTYPE, typeName, Type.NAME); return t; } - + @Test public void orderByScore() throws Exception { Tree idx = createIndex("test1", of("propa")); @@ -1917,7 +1899,7 @@ assertThat(explain(propabQuery), containsString("lucene:test1(/oak:index/test1)")); assertQuery(propabQuery, asList("/test/a")); } - + @Test public void indexingPropertyWithAnalyzeButQueryWithWildcard() throws Exception { Tree index = root.getTree("/"); @@ -1935,22 +1917,22 @@ prop.setProperty(LuceneIndexConstants.PROP_PROPERTY_INDEX, true); prop.setProperty(LuceneIndexConstants.PROP_ANALYZED, true); root.commit(); - + Tree test = root.getTree("/").addChild("test"); test.addChild("a").setProperty("jcr:mimeType", "1234"); test.addChild("b").setProperty("other", "1234"); test.addChild("c").setProperty("jcr:mimeType", "a"); - root.commit(); - + root.commit(); + String query; - + query = "/jcr:root/test//*[jcr:contains(@jcr:mimeType, '1234')]"; assertThat(explainXpath(query), containsString("lucene:test2(/oak:index/test2)")); assertQuery(query, "xpath", asList("/test/a")); query = "/jcr:root/test//*[jcr:contains(., '1234')]"; assertThat(explainXpath(query), containsString("no-index")); - + query = "/jcr:root/test//*[@jcr:mimeType = '1234']"; assertThat(explainXpath(query), containsString("lucene:test2(/oak:index/test2)")); assertQuery(query, "xpath", asList("/test/a")); @@ -2095,6 +2077,47 @@ } @Test + public void longRepExcerpt() throws Exception { + Tree luceneIndex = createFullTextIndex(root.getTree("/"), "lucene"); + + root.commit(); + + StringBuilder s = new StringBuilder(); + for (int k = 0; k < 1000; k++) { + s.append("foo bar ").append(k).append(" "); + } + String text = s.toString(); + List names = new LinkedList(); + for (int j = 0; j < 30; j++) { + Tree test = root.getTree("/").addChild("ex-test-" + j); + for (int i = 0; i < 200; i++) { + String name = "cont" + i; + test.addChild(name).setProperty("text", text); + names.add("/" + test.getName() + "/" + name); + } + } + + root.commit(); + + String query; + + long s2 = System.currentTimeMillis(); + query = "SELECT [jcr:path],[rep:excerpt] from [nt:base] WHERE CONTAINS([text], 'foo')"; + assertQuery(query, SQL2, names); + long e2 = System.currentTimeMillis(); + long t2 = (e2 - s2) / 1000; + assertTrue("search took too much: " + t2 + "s", t2 < 10); + + long s3 = System.currentTimeMillis(); + query = "SELECT [jcr:path] from [nt:base] WHERE CONTAINS([text], 'foo')"; + assertQuery(query, SQL2, names); + long e3 = System.currentTimeMillis(); + long t3 = (e3 - s3) / 1000; + assertTrue("search took too much: " + t3 + "s", t3 < 10); + + } + + @Test public void emptySuggestDictionary() throws Exception{ Tree idx = createIndex("test1", of("propa", "propb")); Tree props = TestUtil.newRulePropTree(idx, "nt:base");