fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages, query);
Object[] result = new Object[docids.length];
for (int j = 0; j < docidsIn.length; j++) {
@@ -432,8 +434,18 @@
protected char getMultiValuedSeparator(String field) {
return ' ';
}
+
+ /**
+ * Returns the analyzer originally used to index the content for {@code field}.
+ *
+ * This is used to highlight some multi-term queries (e.g. wildcard, prefix, fuzzy,
+ * and range queries), whose matching terms cannot be extracted from the query itself.
+ * @param field field to return the index-time analyzer for
+ * @return Analyzer or null (the default, meaning no special multi-term processing)
+ */
+ protected Analyzer getIndexAnalyzer(String field) {
+ return null;
+ }
- private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
+ private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages, Query query) throws IOException {
Map<Integer,Object> highlights = new HashMap<>();
// reuse in the real sense... for docs in same segment we just advance our old enum
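The new hook is opt-in: by default it returns null and behavior is unchanged. A subclass returns the analyzer that was used at index time to enable multi-term highlighting for the field. A minimal sketch (hypothetical; `indexAnalyzer` stands in for whatever analyzer built the index):

    PostingsHighlighter highlighter = new PostingsHighlighter() {
      @Override
      protected Analyzer getIndexAnalyzer(String field) {
        return indexAnalyzer; // a non-null return opts this field in
      }
    };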
@@ -445,6 +457,21 @@
if (fieldFormatter == null) {
throw new NullPointerException("PassageFormatter cannot be null");
}
+
+ // check if we should do any multi-term processing
+ Analyzer analyzer = getIndexAnalyzer(field);
+ CharacterRunAutomaton automata[] = new CharacterRunAutomaton[0];
+ if (analyzer != null) {
+ automata = MultiTermHighlighting.extractAutomata(query, field);
+ }
+
+ final BytesRef allTerms[];
+ if (automata.length > 0) {
+ allTerms = new BytesRef[terms.length + 1];
+ System.arraycopy(terms, 0, allTerms, 0, terms.length);
+ } else {
+ allTerms = terms;
+ }
for (int i = 0; i < docids.length; i++) {
String content = contents[i];
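For orientation, a sketch of the sentinel layout this hunk sets up (illustration only, not part of the patch; assume `query` is body:bar OR body:foo*, so extractAutomata yields one matcher):

    BytesRef[] terms = { new BytesRef("bar") };   // postings-backed query terms
    BytesRef[] allTerms = new BytesRef[terms.length + 1];
    System.arraycopy(terms, 0, allTerms, 0, terms.length);
    // allTerms == { "bar", null }: the trailing null marks the multi-term slot,
    // whose matches are labeled later from the fake enum's payload.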
@@ -462,9 +489,14 @@
}
if (leaf != lastLeaf) {
termsEnum = t.iterator(null);
- postings = new DocsAndPositionsEnum[terms.length];
+ postings = new DocsAndPositionsEnum[allTerms.length];
}
- Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+ if (automata.length > 0) {
+ DocsAndPositionsEnum dp = MultiTermHighlighting.getDocsEnum(analyzer.tokenStream(field, content), automata);
+ dp.advance(doc - subContext.docBase);
+ postings[terms.length] = dp;
+ }
+ Passage passages[] = highlightDoc(field, allTerms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
if (passages.length == 0) {
passages = getEmptyHighlight(field, bi, maxPassages);
}
@@ -593,7 +625,13 @@
int tf = 0;
while (true) {
tf++;
- current.addMatch(start, end, terms[off.id]);
+ BytesRef term = terms[off.id];
+ if (term == null) {
+ // multi-term query match; pull the matched term from the payload
+ term = off.dp.getPayload();
+ assert term != null;
+ }
+ current.addMatch(start, end, term);
if (off.pos == dp.freq()) {
break; // removed from pq
} else {
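Putting the pieces together, a minimal end-to-end sketch (assumed setup, not from the patch; the Version constant, analyzer, and field name are placeholders). The field must be indexed with offsets in the postings, and the subclass must supply the index-time analyzer so the prefix query below gets highlighted:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.*;
    import org.apache.lucene.index.*;
    import org.apache.lucene.search.*;
    import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
    import org.apache.lucene.store.*;
    import org.apache.lucene.util.Version;

    public class MultiTermHighlightingDemo {
      public static void main(String[] args) throws Exception {
        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
        Directory dir = new RAMDirectory();
        IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_46, analyzer));

        // offsets in the postings are a hard requirement of PostingsHighlighter
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        Document doc = new Document();
        doc.add(new Field("body", "postings highlighting of multi-term queries", ft));
        iw.addDocument(doc);

        IndexReader reader = DirectoryReader.open(iw, true);
        iw.close();
        IndexSearcher searcher = new IndexSearcher(reader);

        PostingsHighlighter highlighter = new PostingsHighlighter() {
          @Override
          protected Analyzer getIndexAnalyzer(String field) {
            return analyzer; // opt in: enables multi-term highlighting for this field
          }
        };
        Query query = new PrefixQuery(new Term("body", "high"));
        TopDocs topDocs = searcher.search(query, 10);
        String[] snippets = highlighter.highlight("body", query, searcher, topDocs);
        // snippets[0] contains "<b>highlighting</b>" even though a PrefixQuery
        // contributes no exact terms to look up in the postings
        reader.close();
        dir.close();
      }
    }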
Index: lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java (revision 0)
+++ lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java (working copy)
@@ -0,0 +1,260 @@
+package org.apache.lucene.search.postingshighlight;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.AutomatonQuery;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
+
+/**
+ * Support for highlighting multi-term queries in PostingsHighlighter.
+ */
+class MultiTermHighlighting {
+
+ /**
+ * Extracts all MultiTermQueries for {@code field}, and returns equivalent
+ * automata that will match terms.
+ */
+ static CharacterRunAutomaton[] extractAutomata(Query query, String field) {
+ List<CharacterRunAutomaton> list = new ArrayList<>();
+ if (query instanceof BooleanQuery) {
+ BooleanClause clauses[] = ((BooleanQuery) query).getClauses();
+ for (BooleanClause clause : clauses) {
+ if (!clause.isProhibited()) {
+ list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field)));
+ }
+ }
+ } else if (query instanceof DisjunctionMaxQuery) {
+ for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
+ list.addAll(Arrays.asList(extractAutomata(sub, field)));
+ }
+ } else if (query instanceof AutomatonQuery) {
+ final AutomatonQuery aq = (AutomatonQuery) query;
+ if (aq.getField().equals(field)) {
+ list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
+ @Override
+ public String toString() {
+ return aq.toString();
+ }
+ });
+ }
+ } else if (query instanceof PrefixQuery) {
+ final PrefixQuery pq = (PrefixQuery) query;
+ Term prefix = pq.getPrefix();
+ if (prefix.field().equals(field)) {
+ list.add(new CharacterRunAutomaton(BasicOperations.concatenate(BasicAutomata.makeString(prefix.text()),
+ BasicAutomata.makeAnyString())) {
+ @Override
+ public String toString() {
+ return pq.toString();
+ }
+ });
+ }
+ } else if (query instanceof FuzzyQuery) {
+ final FuzzyQuery fq = (FuzzyQuery) query;
+ if (fq.getField().equals(field)) {
+ String utf16 = fq.getTerm().text();
+ int termText[] = new int[utf16.codePointCount(0, utf16.length())];
+ for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
+ termText[j++] = cp = utf16.codePointAt(i);
+ }
+ int termLength = termText.length;
+ int prefixLength = Math.min(fq.getPrefixLength(), termLength);
+ String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
+ LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
+ Automaton automaton = builder.toAutomaton(fq.getMaxEdits());
+ if (prefixLength > 0) {
+ Automaton prefix = BasicAutomata.makeString(UnicodeUtil.newString(termText, 0, prefixLength));
+ automaton = BasicOperations.concatenate(prefix, automaton);
+ }
+ list.add(new CharacterRunAutomaton(automaton) {
+ @Override
+ public String toString() {
+ return fq.toString();
+ }
+ });
+ }
+ } else if (query instanceof TermRangeQuery) {
+ final TermRangeQuery tq = (TermRangeQuery) query;
+ if (tq.getField().equals(field)) {
+ final CharsRef lowerBound;
+ if (tq.getLowerTerm() == null) {
+ lowerBound = null;
+ } else {
+ lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
+ }
+
+ final CharsRef upperBound;
+ if (tq.getUpperTerm() == null) {
+ upperBound = null;
+ } else {
+ upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
+ }
+
+ final boolean includeLower = tq.includesLower();
+ final boolean includeUpper = tq.includesUpper();
+ final CharsRef scratch = new CharsRef();
+ final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
+
+ // this is *not* an automaton, but it's very simple
+ list.add(new CharacterRunAutomaton(BasicAutomata.makeEmpty()) {
+ @Override
+ public boolean run(char[] s, int offset, int length) {
+ scratch.chars = s;
+ scratch.offset = offset;
+ scratch.length = length;
+
+ if (lowerBound != null) {
+ int cmp = comparator.compare(scratch, lowerBound);
+ if (cmp < 0 || (!includeLower && cmp == 0)) {
+ return false;
+ }
+ }
+
+ if (upperBound != null) {
+ int cmp = comparator.compare(scratch, upperBound);
+ if (cmp > 0 || (!includeUpper && cmp == 0)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ return tq.toString();
+ }
+ });
+ }
+ }
+ return list.toArray(new CharacterRunAutomaton[list.size()]);
+ }
+
+ /**
+ * Returns a "fake" DocsAndPositionsEnum over the tokenstream, returning offsets where {@code matchers}
+ * matches tokens.
+ *
+ * This is solely used internally by PostingsHighlighter: DO NOT USE THIS METHOD!
+ */
+ static DocsAndPositionsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException {
+ final CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class);
+ final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
+ ts.reset();
+
+ // TODO: we could use CachingTokenFilter (or consume the stream twice) to allow us to have a true freq()
+ // but this would have a performance cost for likely little gain in the user experience, it
+ // would only serve to make this method less bogus.
+ // instead, we always return freq() = Integer.MAX_VALUE and let PH terminate based on offset...
+
+ return new DocsAndPositionsEnum() {
+ int currentDoc = -1;
+ int currentMatch = -1;
+ int currentStartOffset = -1;
+ int currentEndOffset = -1;
+ TokenStream stream = ts;
+
+ @Override
+ public int nextPosition() throws IOException {
+ if (stream != null) {
+ while (stream.incrementToken()) {
+ for (int i = 0; i < matchers.length; i++) {
+ if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
+ currentStartOffset = offsetAtt.startOffset();
+ currentEndOffset = offsetAtt.endOffset();
+ currentMatch = i;
+ return 0;
+ }
+ }
+ }
+ stream.end();
+ stream.close();
+ stream = null;
+ }
+ // exhausted
+ currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
+ return Integer.MAX_VALUE;
+ }
+
+ @Override
+ public int freq() throws IOException {
+ return Integer.MAX_VALUE; // lie
+ }
+
+ @Override
+ public int startOffset() throws IOException {
+ assert currentStartOffset >= 0;
+ return currentStartOffset;
+ }
+
+ @Override
+ public int endOffset() throws IOException {
+ assert currentEndOffset >= 0;
+ return currentEndOffset;
+ }
+
+ @Override
+ public BytesRef getPayload() throws IOException {
+ return new BytesRef(matchers[currentMatch].toString());
+ }
+
+ @Override
+ public int docID() {
+ return currentDoc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ return currentDoc = target;
+ }
+
+ @Override
+ public long cost() {
+ return 0;
+ }
+ };
+ }
+}
Property changes on: lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
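Since MultiTermHighlighting is package-private, its behavior is easiest to probe from a test in the same package. A hypothetical sketch (the analyzer and field name are assumed):

    // e.g. inside org.apache.lucene.search.postingshighlight
    static void demo(Analyzer analyzer) throws IOException {
      Query q = new PrefixQuery(new Term("body", "te"));
      CharacterRunAutomaton[] automata = MultiTermHighlighting.extractAutomata(q, "body");
      assert automata.length == 1;
      assert automata[0].run("test");    // the te* prefix matches
      assert !automata[0].run("other");  // anything else does not

      // drive the "fake" enum over freshly analyzed content
      DocsAndPositionsEnum dp = MultiTermHighlighting.getDocsEnum(
          analyzer.tokenStream("body", "this is a test"), automata);
      dp.advance(0);                     // accepts any target; there is no real doc iteration
      assert dp.nextPosition() == 0;     // a match: positions are bogus, offsets are real
      int start = dp.startOffset();      // offsets of the matching token "test"
      int end = dp.endOffset();
      BytesRef label = dp.getPayload();  // the matcher's toString(), used as the "term"
      assert dp.nextPosition() == Integer.MAX_VALUE; // exhausted
    }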