Index: lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java =================================================================== --- lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java (revision 1512250) +++ lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java (working copy) @@ -21,6 +21,8 @@ import java.io.IOException; import java.io.InputStreamReader; import java.text.BreakIterator; +import java.util.Arrays; +import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; @@ -907,7 +909,7 @@ IndexSearcher searcher = newSearcher(ir); PostingsHighlighter highlighter = new PostingsHighlighter() { @Override - protected PassageFormatter getFormatter(String field) { + protected PassageFormatter getFormatter(String field) { return new DefaultPassageFormatter("", "", "... ", true); } }; @@ -965,4 +967,55 @@ ir.close(); dir.close(); } + + // LUCENE-4906 + public void testObjectFormatter() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + GenericPostingsHighlighter> highlighter = new GenericPostingsHighlighter>() { + @Override + protected PassageFormatter> getFormatter(String field) { + return new PassageFormatter>() { + PassageFormatter defaultFormatter = new DefaultPassageFormatter(); + + @Override + public List format(Passage passages[], String content) { + // Just turns the String snippet into a length 2 + // array of String + String[] result = new String[] {"blah blah", defaultFormatter.format(passages, content).toString()}; + return Arrays.asList(result); + } + }; + } + }; + + Query query = new TermQuery(new Term("body", "highlighting")); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + int[] docIDs = new int[1]; + docIDs[0] = topDocs.scoreDocs[0].doc; + Map>> snippets = highlighter.highlightFieldsAsObjects(new String[]{"body"}, query, searcher, docIDs, new int[]{1}); + List> bodySnippets = snippets.get("body"); + assertEquals(1, bodySnippets.size()); + assertTrue(Arrays.equals(new String[]{"blah blah", "Just a test highlighting from postings. 
"}, bodySnippets.get(0).toArray())); + + ir.close(); + dir.close(); + } } Index: lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/GenericPostingsHighlighter.java =================================================================== --- lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/GenericPostingsHighlighter.java (revision 0) +++ lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/GenericPostingsHighlighter.java (revision 0) @@ -0,0 +1,580 @@ +package org.apache.lucene.search.postingshighlight; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.text.BreakIterator; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.SortedSet; +import java.util.TreeSet; + +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReaderContext; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.InPlaceMergeSorter; +import org.apache.lucene.util.UnicodeUtil; + +/** + * Simple highlighter that does not analyze fields nor use + * term vectors. Instead it requires + * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}. + *
+ * <p>
+ * PostingsHighlighter treats the single original document as the whole corpus, and then scores individual + * passages as if they were documents in this corpus. It uses a {@link BreakIterator} to find + * passages in the text; by default it breaks using {@link BreakIterator#getSentenceInstance(Locale) + * getSentenceInstance(Locale.ROOT)}. It then iterates in parallel (merge sorting by offset) through + * the positions of all terms from the query, coalescing those hits that occur in a single passage + * into a {@link Passage}, and then scores each Passage using a separate {@link PassageScorer}. + * Passages are finally formatted into highlighted snippets with a {@link PassageFormatter}. + *
+ * <p>
+ * WARNING: The code is very new and probably still has some exciting bugs! + *
+ * <p>
+ * Example usage:
+ * <pre class="prettyprint">
+ *   // configure field with offsets at index time
+ *   FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ *   offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ *   Field body = new Field("body", "foobar", offsetsType);
+ *
+ *   // retrieve highlights at query time 
+ *   PostingsHighlighter highlighter = new PostingsHighlighter();
+ *   Query query = new TermQuery(new Term("body", "highlighting"));
+ *   TopDocs topDocs = searcher.search(query, n);
+ *   String highlights[] = highlighter.highlight("body", query, searcher, topDocs);
+ * </pre>
+ * <p>
+ * This is thread-safe, and can be used across different readers. + * @lucene.experimental + */ +public abstract class GenericPostingsHighlighter { + + // TODO: maybe allow re-analysis for tiny fields? currently we require offsets, + // but if the analyzer is really fast and the field is tiny, this might really be + // unnecessary. + + /** for rewriting: we don't want slow processing from MTQs */ + private static final IndexReader EMPTY_INDEXREADER = new MultiReader(); + + /** Default maximum content size to process. Typically snippets + * closer to the beginning of the document better summarize its content */ + public static final int DEFAULT_MAX_LENGTH = 10000; + + private final int maxLength; + + /** Set the first time {@link #getScorer} is called, + * and then reused. */ + private PassageScorer defaultScorer; + + /** + * Creates a new highlighter with default parameters. + */ + public GenericPostingsHighlighter() { + this(DEFAULT_MAX_LENGTH); + } + + /** + * Creates a new highlighter, specifying maximum content length. + * @param maxLength maximum content size to process. + * @throws IllegalArgumentException if maxLength is negative or Integer.MAX_VALUE + */ + public GenericPostingsHighlighter(int maxLength) { + if (maxLength < 0 || maxLength == Integer.MAX_VALUE) { + // two reasons: no overflow problems in BreakIterator.preceding(offset+1), + // our sentinel in the offsets queue uses this value to terminate. + throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE"); + } + this.maxLength = maxLength; + } + + /** Returns the {@link BreakIterator} to use for + * dividing text into passages. This returns + * {@link BreakIterator#getSentenceInstance(Locale)} by default; + * subclasses can override to customize. */ + protected BreakIterator getBreakIterator(String field) { + return BreakIterator.getSentenceInstance(Locale.ROOT); + } + + /** Returns the {@link PassageFormatter} to use for + * formatting passages into highlighted snippets. This + * returns a new {@code PassageFormatter} by default; + * subclasses can override to customize. */ + protected abstract PassageFormatter getFormatter(String field); + + /** Returns the {@link PassageScorer} to use for + * ranking passages. This + * returns a new {@code PassageScorer} by default; + * subclasses can override to customize. 
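getBreakIterator is one of the extension points described above; a minimal sketch of overriding it, assuming the concrete PostingsHighlighter subclass and an English sentence iterator purely for illustration:

    // break passages with an English sentence iterator instead of Locale.ROOT
    PostingsHighlighter highlighter = new PostingsHighlighter() {
      @Override
      protected BreakIterator getBreakIterator(String field) {
        return BreakIterator.getSentenceInstance(Locale.ENGLISH);
      }
    };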
*/ + protected PassageScorer getScorer(String field) { + if (defaultScorer == null) { + defaultScorer = new PassageScorer(); + } + return defaultScorer; + } + + // nocommit javadocs + public Map> highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException { + if (fieldsIn.length < 1) { + throw new IllegalArgumentException("fieldsIn must not be empty"); + } + if (fieldsIn.length != maxPassagesIn.length) { + throw new IllegalArgumentException("invalid number of maxPassagesIn"); + } + final IndexReader reader = searcher.getIndexReader(); + query = rewrite(query); + SortedSet queryTerms = new TreeSet(); + query.extractTerms(queryTerms); + + IndexReaderContext readerContext = reader.getContext(); + List leaves = readerContext.leaves(); + + // Make our own copies because we sort in-place: + int[] docids = new int[docidsIn.length]; + System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length); + final String fields[] = new String[fieldsIn.length]; + System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length); + final int maxPassages[] = new int[maxPassagesIn.length]; + System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length); + + // sort for sequential io + Arrays.sort(docids); + new InPlaceMergeSorter() { + + @Override + protected void swap(int i, int j) { + String tmp = fields[i]; + fields[i] = fields[j]; + fields[j] = tmp; + int tmp2 = maxPassages[i]; + maxPassages[i] = maxPassages[j]; + maxPassages[j] = tmp2; + } + + @Override + protected int compare(int i, int j) { + return fields[i].compareTo(fields[j]); + } + + }.sort(0, fields.length); + + // pull stored data: + String[][] contents = loadFieldValues(searcher, fields, docids, maxLength); + + Map> highlights = new HashMap>(); + for (int i = 0; i < fields.length; i++) { + String field = fields[i]; + int numPassages = maxPassages[i]; + Term floor = new Term(field, ""); + Term ceiling = new Term(field, UnicodeUtil.BIG_TERM); + SortedSet fieldTerms = queryTerms.subSet(floor, ceiling); + // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords) + + // Strip off the redundant field: + BytesRef terms[] = new BytesRef[fieldTerms.size()]; + int termUpto = 0; + for(Term term : fieldTerms) { + terms[termUpto++] = term.bytes(); + } + Map fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages); + + List formattedPassages = new ArrayList(docids.length); + for (int docid : docidsIn) { + formattedPassages.add(fieldHighlights.get(docid)); + } + highlights.put(field, formattedPassages); + } + return highlights; + } + + /** Loads the String values for each field X docID to be + * highlighted. By default this loads from stored + * fields, but a subclass can change the source. This + * method should allocate the String[fields.length][docids.length] + * and fill all values. The returned Strings must be + * identical to what was indexed. 
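The loadFieldValues contract described above (allocate String[fields.length][docids.length], fill every value, and return text identical to what was indexed) lets a subclass bypass stored fields entirely; a sketch under that contract, where contentStore and its get method are hypothetical stand-ins for an external source:

    @Override
    protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields,
                                         int[] docids, int maxLength) throws IOException {
      String[][] contents = new String[fields.length][docids.length];
      for (int i = 0; i < fields.length; i++) {
        for (int j = 0; j < docids.length; j++) {
          String value = contentStore.get(fields[i], docids[j]); // hypothetical external lookup
          // truncate to maxLength, as the stored-field implementation does
          contents[i][j] = value.length() > maxLength ? value.substring(0, maxLength) : value;
        }
      }
      return contents;
    }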
*/ + protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { + String contents[][] = new String[fields.length][docids.length]; + char valueSeparators[] = new char[fields.length]; + for (int i = 0; i < fields.length; i++) { + valueSeparators[i] = getMultiValuedSeparator(fields[i]); + } + LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength); + for (int i = 0; i < docids.length; i++) { + searcher.doc(docids[i], visitor); + for (int j = 0; j < fields.length; j++) { + contents[j][i] = visitor.getValue(j).toString(); + } + visitor.reset(); + } + return contents; + } + + /** + * Returns the logical separator between values for multi-valued fields. + * The default value is a space character, which means passages can span across values, + * but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)} + * if each value holds a discrete passage for highlighting. + */ + protected char getMultiValuedSeparator(String field) { + return ' '; + } + + private Map highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List leaves, int maxPassages) throws IOException { + Map highlights = new HashMap(); + + // reuse in the real sense... for docs in same segment we just advance our old enum + DocsAndPositionsEnum postings[] = null; + TermsEnum termsEnum = null; + int lastLeaf = -1; + + PassageFormatter fieldFormatter = getFormatter(field); + if (fieldFormatter == null) { + throw new NullPointerException("PassageFormatter cannot be null"); + } + + for (int i = 0; i < docids.length; i++) { + String content = contents[i]; + if (content.length() == 0) { + continue; // nothing to do + } + bi.setText(content); + int doc = docids[i]; + int leaf = ReaderUtil.subIndex(doc, leaves); + AtomicReaderContext subContext = leaves.get(leaf); + AtomicReader r = subContext.reader(); + Terms t = r.terms(field); + if (t == null) { + continue; // nothing to do + } + if (leaf != lastLeaf) { + termsEnum = t.iterator(null); + postings = new DocsAndPositionsEnum[terms.length]; + } + Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages); + if (passages.length == 0) { + passages = getEmptyHighlight(field, bi, maxPassages); + } + if (passages.length > 0) { + // otherwise a null snippet (eg if field is missing + // entirely from the doc) + highlights.put(doc, fieldFormatter.format(passages, content)); + } + lastLeaf = leaf; + } + + return highlights; + } + + // algorithm: treat sentence snippets as miniature documents + // we can intersect these with the postings lists via BreakIterator.preceding(offset),s + // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq)) + private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc, + TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException { + PassageScorer scorer = getScorer(field); + if (scorer == null) { + throw new NullPointerException("PassageScorer cannot be null"); + } + PriorityQueue pq = new PriorityQueue(); + float weights[] = new float[terms.length]; + // initialize postings + for (int i = 0; i < terms.length; i++) { + DocsAndPositionsEnum de = postings[i]; + int pDoc; + if (de == EMPTY) { + continue; + } else if (de == null) { + postings[i] = EMPTY; // initially + if (!termsEnum.seekExact(terms[i])) { + continue; // term not found + } 
+ de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS); + if (de == null) { + // no positions available + throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); + } + pDoc = de.advance(doc); + } else { + pDoc = de.docID(); + if (pDoc < doc) { + pDoc = de.advance(doc); + } + } + + if (doc == pDoc) { + weights[i] = scorer.weight(contentLength, de.freq()); + de.nextPosition(); + pq.add(new OffsetsEnum(de, i)); + } + } + + pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination + + PriorityQueue passageQueue = new PriorityQueue(n, new Comparator() { + @Override + public int compare(Passage left, Passage right) { + if (left.score < right.score) { + return -1; + } else if (left.score > right.score) { + return 1; + } else { + return left.startOffset - right.startOffset; + } + } + }); + Passage current = new Passage(); + + OffsetsEnum off; + while ((off = pq.poll()) != null) { + final DocsAndPositionsEnum dp = off.dp; + int start = dp.startOffset(); + if (start == -1) { + throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); + } + int end = dp.endOffset(); + if (start >= current.endOffset) { + if (current.startOffset >= 0) { + // finalize current + current.score *= scorer.norm(current.startOffset); + // new sentence: first add 'current' to queue + if (passageQueue.size() == n && current.score < passageQueue.peek().score) { + current.reset(); // can't compete, just reset it + } else { + passageQueue.offer(current); + if (passageQueue.size() > n) { + current = passageQueue.poll(); + current.reset(); + } else { + current = new Passage(); + } + } + } + // if we exceed limit, we are done + if (start >= contentLength) { + Passage passages[] = new Passage[passageQueue.size()]; + passageQueue.toArray(passages); + for (Passage p : passages) { + p.sort(); + } + // sort in ascending order + Arrays.sort(passages, new Comparator() { + @Override + public int compare(Passage left, Passage right) { + return left.startOffset - right.startOffset; + } + }); + return passages; + } + // advance breakiterator + assert BreakIterator.DONE < 0; + current.startOffset = Math.max(bi.preceding(start+1), 0); + current.endOffset = Math.min(bi.next(), contentLength); + } + int tf = 0; + while (true) { + tf++; + current.addMatch(start, end, terms[off.id]); + if (off.pos == dp.freq()) { + break; // removed from pq + } else { + off.pos++; + dp.nextPosition(); + start = dp.startOffset(); + end = dp.endOffset(); + } + if (start >= current.endOffset) { + pq.offer(off); + break; + } + } + current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset); + } + + // Dead code but compiler disagrees: + assert false; + return null; + } + + /** Called to summarize a document when no hits were + * found. By default this just returns the first + * {@code maxPassages} sentences; subclasses can override + * to customize. 
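getEmptyHighlight above decides what a document with no hits contributes; a minimal sketch that suppresses the default leading-sentences summary so such documents end up with a null snippet instead:

    @Override
    protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
      return new Passage[0]; // no passages: highlightField leaves the snippet null
    }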
*/ + protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { + // BreakIterator should be un-next'd: + List passages = new ArrayList(); + int pos = bi.current(); + assert pos == 0; + while (passages.size() < maxPassages) { + int next = bi.next(); + if (next == BreakIterator.DONE) { + break; + } + Passage passage = new Passage(); + passage.score = Float.NaN; + passage.startOffset = pos; + passage.endOffset = next; + passages.add(passage); + pos = next; + } + + return passages.toArray(new Passage[passages.size()]); + } + + private static class OffsetsEnum implements Comparable { + DocsAndPositionsEnum dp; + int pos; + int id; + + OffsetsEnum(DocsAndPositionsEnum dp, int id) throws IOException { + this.dp = dp; + this.id = id; + this.pos = 1; + } + + @Override + public int compareTo(OffsetsEnum other) { + try { + int off = dp.startOffset(); + int otherOff = other.dp.startOffset(); + if (off == otherOff) { + return id - other.id; + } else { + return Long.signum(((long)off) - otherOff); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + private static final DocsAndPositionsEnum EMPTY = new DocsAndPositionsEnum() { + + @Override + public int nextPosition() throws IOException { return 0; } + + @Override + public int startOffset() throws IOException { return Integer.MAX_VALUE; } + + @Override + public int endOffset() throws IOException { return Integer.MAX_VALUE; } + + @Override + public BytesRef getPayload() throws IOException { return null; } + + @Override + public int freq() throws IOException { return 0; } + + @Override + public int docID() { return NO_MORE_DOCS; } + + @Override + public int nextDoc() throws IOException { return NO_MORE_DOCS; } + + @Override + public int advance(int target) throws IOException { return NO_MORE_DOCS; } + + @Override + public long cost() { return 0; } + }; + + /** + * we rewrite against an empty indexreader: as we don't want things like + * rangeQueries that don't summarize the document + */ + private static Query rewrite(Query original) throws IOException { + Query query = original; + for (Query rewrittenQuery = query.rewrite(EMPTY_INDEXREADER); rewrittenQuery != query; + rewrittenQuery = query.rewrite(EMPTY_INDEXREADER)) { + query = rewrittenQuery; + } + return query; + } + + private static class LimitedStoredFieldVisitor extends StoredFieldVisitor { + private final String fields[]; + private final char valueSeparators[]; + private final int maxLength; + private final StringBuilder builders[]; + private int currentField = -1; + + public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) { + assert fields.length == valueSeparators.length; + this.fields = fields; + this.valueSeparators = valueSeparators; + this.maxLength = maxLength; + builders = new StringBuilder[fields.length]; + for (int i = 0; i < builders.length; i++) { + builders[i] = new StringBuilder(); + } + } + + @Override + public void stringField(FieldInfo fieldInfo, String value) throws IOException { + assert currentField >= 0; + StringBuilder builder = builders[currentField]; + if (builder.length() > 0 && builder.length() < maxLength) { + builder.append(valueSeparators[currentField]); + } + if (builder.length() + value.length() > maxLength) { + builder.append(value, 0, maxLength - builder.length()); + } else { + builder.append(value); + } + } + + @Override + public Status needsField(FieldInfo fieldInfo) throws IOException { + currentField = Arrays.binarySearch(fields, fieldInfo.name); + if (currentField 
< 0) { + return Status.NO; + } else if (builders[currentField].length() > maxLength) { + return fields.length == 1 ? Status.STOP : Status.NO; + } + return Status.YES; + } + + String getValue(int i) { + return builders[i].toString(); + } + + void reset() { + currentField = -1; + for (int i = 0; i < fields.length; i++) { + builders[i].setLength(0); + } + } + } +} Index: lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java =================================================================== --- lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java (revision 1512250) +++ lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java (working copy) @@ -19,37 +19,17 @@ import java.io.IOException; import java.text.BreakIterator; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.PriorityQueue; -import java.util.SortedSet; -import java.util.TreeSet; -import org.apache.lucene.index.AtomicReader; -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo.IndexOptions; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexReaderContext; -import org.apache.lucene.index.MultiReader; -import org.apache.lucene.index.ReaderUtil; -import org.apache.lucene.index.StoredFieldVisitor; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.InPlaceMergeSorter; -import org.apache.lucene.util.UnicodeUtil; /** * Simple highlighter that does not analyze fields nor use @@ -83,34 +63,15 @@ * This is thread-safe, and can be used across different readers. * @lucene.experimental */ -public class PostingsHighlighter { +public class PostingsHighlighter extends GenericPostingsHighlighter { - // TODO: maybe allow re-analysis for tiny fields? currently we require offsets, - // but if the analyzer is really fast and the field is tiny, this might really be - // unnecessary. - - /** for rewriting: we don't want slow processing from MTQs */ - private static final IndexReader EMPTY_INDEXREADER = new MultiReader(); - - /** Default maximum content size to process. Typically snippets - * closer to the beginning of the document better summarize its content */ - public static final int DEFAULT_MAX_LENGTH = 10000; - - private final int maxLength; + private PassageFormatter defaultFormatter; - /** Set the first time {@link #getFormatter} is called, - * and then reused. */ - private PassageFormatter defaultFormatter; - - /** Set the first time {@link #getScorer} is called, - * and then reused. */ - private PassageScorer defaultScorer; - /** * Creates a new highlighter with default parameters. 
*/ public PostingsHighlighter() { - this(DEFAULT_MAX_LENGTH); + super(); } /** @@ -119,44 +80,20 @@ * @throws IllegalArgumentException if maxLength is negative or Integer.MAX_VALUE */ public PostingsHighlighter(int maxLength) { - if (maxLength < 0 || maxLength == Integer.MAX_VALUE) { - // two reasons: no overflow problems in BreakIterator.preceding(offset+1), - // our sentinel in the offsets queue uses this value to terminate. - throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE"); - } - this.maxLength = maxLength; + super(maxLength); } - - /** Returns the {@link BreakIterator} to use for - * dividing text into passages. This returns - * {@link BreakIterator#getSentenceInstance(Locale)} by default; - * subclasses can override to customize. */ - protected BreakIterator getBreakIterator(String field) { - return BreakIterator.getSentenceInstance(Locale.ROOT); - } /** Returns the {@link PassageFormatter} to use for * formatting passages into highlighted snippets. This * returns a new {@code PassageFormatter} by default; * subclasses can override to customize. */ - protected PassageFormatter getFormatter(String field) { + protected PassageFormatter getFormatter(String field) { if (defaultFormatter == null) { defaultFormatter = new DefaultPassageFormatter(); } return defaultFormatter; } - /** Returns the {@link PassageScorer} to use for - * ranking passages. This - * returns a new {@code PassageScorer} by default; - * subclasses can override to customize. */ - protected PassageScorer getScorer(String field) { - if (defaultScorer == null) { - defaultScorer = new PassageScorer(); - } - return defaultScorer; - } - /** * Highlights the top passages from a single field. * @@ -267,7 +204,7 @@ return highlightFields(fields, query, searcher, docids, maxPassages); } - + /** * Highlights the top-N passages from multiple fields, * for the provided int[] docids. 
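The docid-based variants kept on PostingsHighlighter are easiest to read with a concrete call; a usage sketch, with the field names and passage counts invented for the example:

    int[] docIDs = new int[] { topDocs.scoreDocs[0].doc };
    Map<String,String[]> snippets = highlighter.highlightFields(
        new String[] { "title", "body" }, query, searcher, docIDs, new int[] { 1, 2 });
    String bodySnippet = snippets.get("body")[0]; // null only if the doc has no "body" value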
@@ -289,433 +226,11 @@ * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} */ public Map highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException { - if (fieldsIn.length < 1) { - throw new IllegalArgumentException("fieldsIn must not be empty"); + Map snippets = new HashMap(); + for(Map.Entry> ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) { + List snippetStrings = ent.getValue(); + snippets.put(ent.getKey(), ent.getValue().toArray(new String[snippetStrings.size()])); } - if (fieldsIn.length != maxPassagesIn.length) { - throw new IllegalArgumentException("invalid number of maxPassagesIn"); - } - final IndexReader reader = searcher.getIndexReader(); - query = rewrite(query); - SortedSet queryTerms = new TreeSet(); - query.extractTerms(queryTerms); - - IndexReaderContext readerContext = reader.getContext(); - List leaves = readerContext.leaves(); - - // Make our own copies because we sort in-place: - int[] docids = new int[docidsIn.length]; - System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length); - final String fields[] = new String[fieldsIn.length]; - System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length); - final int maxPassages[] = new int[maxPassagesIn.length]; - System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length); - - // sort for sequential io - Arrays.sort(docids); - new InPlaceMergeSorter() { - - @Override - protected void swap(int i, int j) { - String tmp = fields[i]; - fields[i] = fields[j]; - fields[j] = tmp; - int tmp2 = maxPassages[i]; - maxPassages[i] = maxPassages[j]; - maxPassages[j] = tmp2; - } - - @Override - protected int compare(int i, int j) { - return fields[i].compareTo(fields[j]); - } - - }.sort(0, fields.length); - - // pull stored data: - String[][] contents = loadFieldValues(searcher, fields, docids, maxLength); - - Map highlights = new HashMap(); - for (int i = 0; i < fields.length; i++) { - String field = fields[i]; - int numPassages = maxPassages[i]; - Term floor = new Term(field, ""); - Term ceiling = new Term(field, UnicodeUtil.BIG_TERM); - SortedSet fieldTerms = queryTerms.subSet(floor, ceiling); - // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords) - - // Strip off the redundant field: - BytesRef terms[] = new BytesRef[fieldTerms.size()]; - int termUpto = 0; - for(Term term : fieldTerms) { - terms[termUpto++] = term.bytes(); - } - Map fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages); - - String[] result = new String[docids.length]; - for (int j = 0; j < docidsIn.length; j++) { - result[j] = fieldHighlights.get(docidsIn[j]); - } - highlights.put(field, result); - } - return highlights; + return snippets; } - - /** Loads the String values for each field X docID to be - * highlighted. By default this loads from stored - * fields, but a subclass can change the source. This - * method should allocate the String[fields.length][docids.length] - * and fill all values. The returned Strings must be - * identical to what was indexed. 
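The replacement body of highlightFields at the top of this hunk appears to have lost its generic parameters to markup stripping; reconstructed under the assumption that PostingsHighlighter specializes GenericPostingsHighlighter to String, it would read roughly:

    Map<String,String[]> snippets = new HashMap<String,String[]>();
    for (Map.Entry<String,List<String>> ent :
         highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) {
      List<String> snippetStrings = ent.getValue();
      snippets.put(ent.getKey(), snippetStrings.toArray(new String[snippetStrings.size()]));
    }
    return snippets;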
*/ - protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { - String contents[][] = new String[fields.length][docids.length]; - char valueSeparators[] = new char[fields.length]; - for (int i = 0; i < fields.length; i++) { - valueSeparators[i] = getMultiValuedSeparator(fields[i]); - } - LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength); - for (int i = 0; i < docids.length; i++) { - searcher.doc(docids[i], visitor); - for (int j = 0; j < fields.length; j++) { - contents[j][i] = visitor.getValue(j).toString(); - } - visitor.reset(); - } - return contents; - } - - /** - * Returns the logical separator between values for multi-valued fields. - * The default value is a space character, which means passages can span across values, - * but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)} - * if each value holds a discrete passage for highlighting. - */ - protected char getMultiValuedSeparator(String field) { - return ' '; - } - - private Map highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List leaves, int maxPassages) throws IOException { - Map highlights = new HashMap(); - - // reuse in the real sense... for docs in same segment we just advance our old enum - DocsAndPositionsEnum postings[] = null; - TermsEnum termsEnum = null; - int lastLeaf = -1; - - PassageFormatter fieldFormatter = getFormatter(field); - if (fieldFormatter == null) { - throw new NullPointerException("PassageFormatter cannot be null"); - } - - for (int i = 0; i < docids.length; i++) { - String content = contents[i]; - if (content.length() == 0) { - continue; // nothing to do - } - bi.setText(content); - int doc = docids[i]; - int leaf = ReaderUtil.subIndex(doc, leaves); - AtomicReaderContext subContext = leaves.get(leaf); - AtomicReader r = subContext.reader(); - Terms t = r.terms(field); - if (t == null) { - continue; // nothing to do - } - if (leaf != lastLeaf) { - termsEnum = t.iterator(null); - postings = new DocsAndPositionsEnum[terms.length]; - } - Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages); - if (passages.length == 0) { - passages = getEmptyHighlight(field, bi, maxPassages); - } - if (passages.length > 0) { - // otherwise a null snippet (eg if field is missing - // entirely from the doc) - highlights.put(doc, fieldFormatter.format(passages, content)); - } - lastLeaf = leaf; - } - - return highlights; - } - - // algorithm: treat sentence snippets as miniature documents - // we can intersect these with the postings lists via BreakIterator.preceding(offset),s - // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq)) - private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc, - TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException { - PassageScorer scorer = getScorer(field); - if (scorer == null) { - throw new NullPointerException("PassageScorer cannot be null"); - } - PriorityQueue pq = new PriorityQueue(); - float weights[] = new float[terms.length]; - // initialize postings - for (int i = 0; i < terms.length; i++) { - DocsAndPositionsEnum de = postings[i]; - int pDoc; - if (de == EMPTY) { - continue; - } else if (de == null) { - postings[i] = EMPTY; // initially - if (!termsEnum.seekExact(terms[i])) { - continue; // term not found - } 
- de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS); - if (de == null) { - // no positions available - throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); - } - pDoc = de.advance(doc); - } else { - pDoc = de.docID(); - if (pDoc < doc) { - pDoc = de.advance(doc); - } - } - - if (doc == pDoc) { - weights[i] = scorer.weight(contentLength, de.freq()); - de.nextPosition(); - pq.add(new OffsetsEnum(de, i)); - } - } - - pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination - - PriorityQueue passageQueue = new PriorityQueue(n, new Comparator() { - @Override - public int compare(Passage left, Passage right) { - if (left.score < right.score) { - return -1; - } else if (left.score > right.score) { - return 1; - } else { - return left.startOffset - right.startOffset; - } - } - }); - Passage current = new Passage(); - - OffsetsEnum off; - while ((off = pq.poll()) != null) { - final DocsAndPositionsEnum dp = off.dp; - int start = dp.startOffset(); - if (start == -1) { - throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); - } - int end = dp.endOffset(); - if (start >= current.endOffset) { - if (current.startOffset >= 0) { - // finalize current - current.score *= scorer.norm(current.startOffset); - // new sentence: first add 'current' to queue - if (passageQueue.size() == n && current.score < passageQueue.peek().score) { - current.reset(); // can't compete, just reset it - } else { - passageQueue.offer(current); - if (passageQueue.size() > n) { - current = passageQueue.poll(); - current.reset(); - } else { - current = new Passage(); - } - } - } - // if we exceed limit, we are done - if (start >= contentLength) { - Passage passages[] = new Passage[passageQueue.size()]; - passageQueue.toArray(passages); - for (Passage p : passages) { - p.sort(); - } - // sort in ascending order - Arrays.sort(passages, new Comparator() { - @Override - public int compare(Passage left, Passage right) { - return left.startOffset - right.startOffset; - } - }); - return passages; - } - // advance breakiterator - assert BreakIterator.DONE < 0; - current.startOffset = Math.max(bi.preceding(start+1), 0); - current.endOffset = Math.min(bi.next(), contentLength); - } - int tf = 0; - while (true) { - tf++; - current.addMatch(start, end, terms[off.id]); - if (off.pos == dp.freq()) { - break; // removed from pq - } else { - off.pos++; - dp.nextPosition(); - start = dp.startOffset(); - end = dp.endOffset(); - } - if (start >= current.endOffset) { - pq.offer(off); - break; - } - } - current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset); - } - - // Dead code but compiler disagrees: - assert false; - return null; - } - - /** Called to summarize a document when no hits were - * found. By default this just returns the first - * {@code maxPassages} sentences; subclasses can override - * to customize. 
*/ - protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { - // BreakIterator should be un-next'd: - List passages = new ArrayList(); - int pos = bi.current(); - assert pos == 0; - while (passages.size() < maxPassages) { - int next = bi.next(); - if (next == BreakIterator.DONE) { - break; - } - Passage passage = new Passage(); - passage.score = Float.NaN; - passage.startOffset = pos; - passage.endOffset = next; - passages.add(passage); - pos = next; - } - - return passages.toArray(new Passage[passages.size()]); - } - - private static class OffsetsEnum implements Comparable { - DocsAndPositionsEnum dp; - int pos; - int id; - - OffsetsEnum(DocsAndPositionsEnum dp, int id) throws IOException { - this.dp = dp; - this.id = id; - this.pos = 1; - } - - @Override - public int compareTo(OffsetsEnum other) { - try { - int off = dp.startOffset(); - int otherOff = other.dp.startOffset(); - if (off == otherOff) { - return id - other.id; - } else { - return Long.signum(((long)off) - otherOff); - } - } catch (IOException e) { - throw new RuntimeException(e); - } - } - } - - private static final DocsAndPositionsEnum EMPTY = new DocsAndPositionsEnum() { - - @Override - public int nextPosition() throws IOException { return 0; } - - @Override - public int startOffset() throws IOException { return Integer.MAX_VALUE; } - - @Override - public int endOffset() throws IOException { return Integer.MAX_VALUE; } - - @Override - public BytesRef getPayload() throws IOException { return null; } - - @Override - public int freq() throws IOException { return 0; } - - @Override - public int docID() { return NO_MORE_DOCS; } - - @Override - public int nextDoc() throws IOException { return NO_MORE_DOCS; } - - @Override - public int advance(int target) throws IOException { return NO_MORE_DOCS; } - - @Override - public long cost() { return 0; } - }; - - /** - * we rewrite against an empty indexreader: as we don't want things like - * rangeQueries that don't summarize the document - */ - private static Query rewrite(Query original) throws IOException { - Query query = original; - for (Query rewrittenQuery = query.rewrite(EMPTY_INDEXREADER); rewrittenQuery != query; - rewrittenQuery = query.rewrite(EMPTY_INDEXREADER)) { - query = rewrittenQuery; - } - return query; - } - - private static class LimitedStoredFieldVisitor extends StoredFieldVisitor { - private final String fields[]; - private final char valueSeparators[]; - private final int maxLength; - private final StringBuilder builders[]; - private int currentField = -1; - - public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) { - assert fields.length == valueSeparators.length; - this.fields = fields; - this.valueSeparators = valueSeparators; - this.maxLength = maxLength; - builders = new StringBuilder[fields.length]; - for (int i = 0; i < builders.length; i++) { - builders[i] = new StringBuilder(); - } - } - - @Override - public void stringField(FieldInfo fieldInfo, String value) throws IOException { - assert currentField >= 0; - StringBuilder builder = builders[currentField]; - if (builder.length() > 0 && builder.length() < maxLength) { - builder.append(valueSeparators[currentField]); - } - if (builder.length() + value.length() > maxLength) { - builder.append(value, 0, maxLength - builder.length()); - } else { - builder.append(value); - } - } - - @Override - public Status needsField(FieldInfo fieldInfo) throws IOException { - currentField = Arrays.binarySearch(fields, fieldInfo.name); - if (currentField 
< 0) { - return Status.NO; - } else if (builders[currentField].length() > maxLength) { - return fields.length == 1 ? Status.STOP : Status.NO; - } - return Status.YES; - } - - String getValue(int i) { - return builders[i].toString(); - } - - void reset() { - currentField = -1; - for (int i = 0; i < fields.length; i++) { - builders[i].setLength(0); - } - } - } } Index: lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java =================================================================== --- lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java (revision 1512250) +++ lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java (working copy) @@ -22,8 +22,10 @@ * * @lucene.experimental */ -public abstract class PassageFormatter { +public abstract class PassageFormatter { + // nocommit fixup javadocs + /** * Formats the top passages from content * into a human-readable text snippet. @@ -33,6 +35,6 @@ * @param content content for the field. * @return formatted highlight */ - public abstract String format(Passage passages[], String content); + public abstract FormattedPassage format(Passage passages[], String content); } Index: lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/DefaultPassageFormatter.java =================================================================== --- lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/DefaultPassageFormatter.java (revision 1512250) +++ lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/DefaultPassageFormatter.java (working copy) @@ -23,7 +23,7 @@ * The default implementation marks the query terms as bold, and places * ellipses between unconnected passages. */ -public class DefaultPassageFormatter extends PassageFormatter { +public class DefaultPassageFormatter extends PassageFormatter { /** text that will appear before highlighted terms */ protected final String preTag; /** text that will appear after highlighted terms */
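The PassageFormatter and DefaultPassageFormatter hunks above generify the formatter over its result type (the actual type parameters appear to have been stripped from this patch text, e.g. PassageFormatter<FormattedPassage> with DefaultPassageFormatter extending PassageFormatter<String>). A sketch of a formatter returning a structured result instead of a String, where the Snippet class is hypothetical:

    public class SnippetFormatter extends PassageFormatter<Snippet> {
      private final DefaultPassageFormatter delegate = new DefaultPassageFormatter();

      @Override
      public Snippet format(Passage[] passages, String content) {
        // pair the default <b>...</b> String rendering with the raw passage offsets
        return new Snippet(delegate.format(passages, content), passages.clone());
      }
    }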