Index: lucene/contrib/CHANGES.txt --- lucene/contrib/CHANGES.txt Wed May 25 06:27:40 2011 -0400 +++ lucene/contrib/CHANGES.txt Thu Jun 02 12:38:16 2011 -0400 @@ -106,6 +106,12 @@ case where the indexing rate is lowish but the reopen rate is highish, to take load off the IO system. (Mike McCandless) + * LUCENE-3129: Added BlockGroupingCollector, a single pass + grouping collector which is faster than the two-pass approach, and + also computes the total group count, but requires that every + document sharing the same group was indexed as a doc block + (IndexWriter.add/updateDocuments). (Mike McCandless) + Optimizations * LUCENE-3040: Switch all analysis consumers (highlighter, morelikethis, memory, ...) Index: modules/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ modules/grouping/src/java/org/apache/lucene/search/grouping/BlockGroupingCollector.java Thu Jun 02 12:38:16 2011 -0400 @@ -0,0 +1,516 @@ +package org.apache.lucene.search.grouping; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.index.IndexWriter; // javadocs +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.FieldComparator; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopDocsCollector; +import org.apache.lucene.search.TopFieldCollector; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.PriorityQueue; + +/** BlockGroupingCollector performs grouping with a + * single pass collector, as long as you are grouping by a + * doc block field, ie all documents sharing a given group + * value were indexed as a doc block using the atomic + * {@link IndexWriter#addDocuments} or {@link + * IndexWriter#updateDocuments} API. + * + *
This results in faster performance (~25% faster QPS) + * than the two-pass grouping collectors, with the tradeoff + * being that the documents in each group must always be + * indexed as a block. This collector also fills in + * TopGroups.totalGroupCount without requiring the separate + * {@link AllGroupsCollector}. However, this collector does + * not fill in the groupValue of each group; this field + * will always be null. + * + *
NOTE: this collector makes no effort to verify + * the docs were in fact indexed as a block, so it's up to + * you to ensure this was the case. + * + *
See {@link org.apache.lucene.search.grouping} for more + * details including a full code example.
+ * + * @lucene.experimental + */ + +public class BlockGroupingCollector extends Collector { + + private int[] pendingSubDocs; + private float[] pendingSubScores; + private int subDocUpto; + + private final Sort groupSort; + private final int topNGroups; + private final Filter lastDocPerGroup; + + // TODO: specialize into 2 classes, static "create" method: + private final boolean needsScores; + + private final FieldComparator[] comparators; + private final int[] reversed; + private final int compIDXEnd; + private int bottomSlot; + private boolean queueFull; + private AtomicReaderContext currentReaderContext; + + private int topGroupDoc; + private int totalHitCount; + private int totalGroupCount; + private int docBase; + private int groupEndDocID; + //private OpenBitSet lastDocPerGroupBits; + private DocIdSetIterator lastDocPerGroupBits; + private Scorer scorer; + private final GroupQueue groupQueue; + private boolean groupCompetes; + + private final static class FakeScorer extends Scorer { + + float score; + int doc; + + public FakeScorer() { + super((Weight) null); + } + + @Override + public float score() { + return score; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public int nextDoc() { + throw new UnsupportedOperationException(); + } + } + + private static final class OneGroup { + AtomicReaderContext readerContext; + //int groupOrd; + int topGroupDoc; + int[] docs; + float[] scores; + int count; + int comparatorSlot; + } + + // Sorts by groupSort. Not static -- uses comparators, reversed + private final class GroupQueue extends PriorityQueueNOTE: This collector is unable to compute
+ * the groupValue per group so it will always be null.
+ * This is normally not a problem, as you can obtain the
+ * value just like you obtain other values for each
+ * matching document (eg, via stored fields, via
+ * FieldCache, etc.)
+ *
+ * @param withinGroupSort The {@link Sort} used to sort
+ * documents within each group. Passing null is
+ * allowed, to sort by relevance.
+ * @param groupOffset Which group to start from
+ * @param withinGroupOffset Which document to start from
+ * within each group
+ * @param maxDocsPerGroup How many top documents to keep
+ * within each group.
+ * @param fillSortFields If true then the Comparable
+ * values for the sort fields will be set
+ */
+ public TopGroups getTopGroups(Sort withinGroupSort, int groupOffset, int withinGroupOffset, int maxDocsPerGroup, boolean fillSortFields) throws IOException {
+
+ //if (queueFull) {
+ //System.out.println("getTopGroups groupOffset=" + groupOffset + " topNGroups=" + topNGroups);
+ //}
+ if (subDocUpto != 0) {
+ processGroup();
+ }
+ if (groupOffset >= groupQueue.size()) {
+ return null;
+ }
+ int totalGroupedHitCount = 0;
+
+ final FakeScorer fakeScorer = new FakeScorer();
+
+ final GroupDocs[] groups = new GroupDocs[groupQueue.size() - groupOffset];
+ for(int downTo=groupQueue.size()-groupOffset-1;downTo>=0;downTo--) {
+ final OneGroup og = groupQueue.pop();
+
+ // At this point we hold all docs w/ in each group,
+ // unsorted; we now sort them:
+ final TopDocsCollector collector;
+ if (withinGroupSort == null) {
+ // Sort by score
+ if (!needsScores) {
+ throw new IllegalArgumentException("cannot sort by relevance within group: needsScores=false");
+ }
+ collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
+ } else {
+ // Sort by fields
+ collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, needsScores, needsScores, true);
+ }
+
+ collector.setScorer(fakeScorer);
+ collector.setNextReader(og.readerContext);
+ for(int docIDX=0;docIDXauthor
+field, then all documents with the same value in the author
field fall into a single group.
Grouping requires a number of inputs:
groupField: this is the field used for grouping.
+ For example, if you use the author field then each
group has all books by the same author. Documents that don't
have this field are grouped under a single group with
- a null group value.
+ a null group value.
- groupSort: how the groups are sorted. For sorting
purposes, each group is "represented" by the highest-sorted
- document according to the groupSort within it. For
+ document according to the groupSort within it. For
example, if you specify "price" (ascending) then the first group
is the one with the lowest price book within it. Or if you
specify relevance group sort, then the first group is the one
containing the highest scoring book.
- topNGroups: how many top groups to keep. For
example, 10 means the top 10 groups are computed.
- groupOffset: which "slice" of top groups you want to
retrieve. For example, 3 means you'll get 7 groups back
- (assuming topNGroups is 10). This is useful for
+ (assuming topNGroups is 10). This is useful for
paging, where you might show 5 groups per page.
- withinGroupSort: how the documents within each group
are sorted. This can be different from the group sort.
- maxDocsPerGroup: how many top documents within each
group to keep.
- withinGroupOffset: which "slice" of top
documents you want to retrieve from each group.
The implementation is two-pass: the first pass ({@link - org.apache.lucene.search.grouping.FirstPassGroupingCollector}) - gathers the top groups, and the second pass ({@link - org.apache.lucene.search.grouping.SecondPassGroupingCollector}) - gathers documents within those groups. If the search is costly to - run you may want to use the {@link - org.apache.lucene.search.CachingCollector} class, which - caches hits and can (quickly) replay them for the second pass. This - way you only run the query once, but you pay a RAM cost to (briefly) - hold all hits. Results are returned as a {@link - org.apache.lucene.search.grouping.TopGroups} instance.
++There are two grouping implementations here: +
Doc block grouping (BlockGroupingCollector) that
+ is able to group according to the doc blocks created during
+ indexing using IndexWriter's add/updateDocuments API.
+ This is faster (~25% faster QPS) than the generic two-pass
+ collector, but it only works for doc blocks so you must statically
+ commit (during indexing) to which grouping you'll need at search
+ time.
+
+ This implementation does not rely on a single valued grouping
+ field; rather, the blocks in the index define the groups, so your
+ application is free to determine what the grouping criteria is.
+ At search time, you must provide a Filter that marks
+ the last document in each group. This is a substantial memory
+ savings because this collector does not load
+ a DocTermsIndex from the
+ FieldCache.
+
The benefit of the arbitrary grouping implementation is you don't have
+to commit at indexing time to a static grouping of your documents.
+But the downside is it's somewhat slower to run, and requires more RAM
+(a FieldCache.DocTermsIndex entry is created).
Known limitations:
Typical usage looks like this (using the {@link org.apache.lucene.search.CachingCollector}):
+Typical usage for the generic two-pass collector looks like this + (using the {@link org.apache.lucene.search.CachingCollector}):
FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);
@@ -111,5 +142,50 @@
// Render groupsResult...
+To use the single-pass BlockGroupingCollector,
+ first, at indexing time, you must ensure all docs in each group
+ are added as a block, and you have some way to find the last
+ document of each group. One simple way to do this is to add a
+ marker binary field:
+ // Create Documents from your source:
+ List<Document> oneGroup = ...;
+
+ Field groupEndField = new Field("groupEnd", "x", Field.Store.NO, Field.Index.NOT_ANALYZED);
+ groupEndField.setOmitTermFreqAndPositions(true);
+ groupEndField.setOmitNorms(true);
+ oneGroup.get(oneGroup.size()-1).add(groupEndField);
+
+ // You can also use writer.updateDocuments(); just be sure you
+ // replace an entire previous doc block with this new one. For
+ // example, each group could have a "groupID" field, with the same
+ // value for all docs in this group:
+ writer.addDocuments(oneGroup);
+
+
+Then, at search time, do this up front:
+
+
+ // Set this once in your app & save away for reusing across all queries:
+ Filter groupEndDocs = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupEnd", "x"))));
+
+
+Finally, do this per search:
+
+
+ // Per search:
+ BlockGroupingCollector c = new BlockGroupingCollector(groupSort, groupOffset+topNGroups, needsScores, groupEndDocs);
+ s.search(new TermQuery(new Term("content", searchTerm)), c);
+ TopGroups groupsResult = c.getTopGroups(withinGroupSort, groupOffset, docOffset, docOffset+docsPerGroup, fillFields);
+
+ // Render groupsResult...
+
+
+Note that the groupValue of each GroupDocs
+will be null, so if you need to present this value you'll
+have to separately retrieve it (for example using stored
+fields, FieldCache, etc.).
+