Index: lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html =================================================================== --- lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html (revision 1103147) +++ lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html (revision ) @@ -88,6 +88,13 @@ boolean fillFields = true; SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields); + //Optionally compute total group count + AllGroupsCollector allGroupsCollector = null; + if (requiredTotalGroupCount) { + allGroupsCollector = new AllGroupsCollector("author"); + c2 = MultiCollector.wrap(c2, allGroupsCollector); + } + if (cachedCollector.isCached()) { // Cache fit within maxCacheRAMMB, so we can replay it: cachedCollector.replay(c2); @@ -95,8 +102,11 @@ // Cache was too large; must re-execute query: s.search(new TermQuery(new Term("content", searchTerm)), c2); } - + TopGroups groupsResult = c2.getTopGroups(docOffset); + if (requiredTotalGroupCount) { + groupsResult = new TopGroups(groupsResult, allGroupsCollector.getGroupCount()); + } // Render groupsResult... Index: lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java =================================================================== --- lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java (revision ) +++ lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java (revision ) @@ -0,0 +1,123 @@ +package org.apache.lucene.search.grouping; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.Scorer; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * A collector that collects all groups that match the query. Only the group value is collected. It can't determine + * the most relevant document of a group. + *

+ * Internally the {@link SentinelIntSet} is responsible for detecting if a group has already been added to the total count. + * For each segment the {@link SentinelIntSet} is cleared and filled with previously counted groups that occur in + * the new segment. + * + * @lucene.experimental + */ +public class AllGroupsCollector extends Collector { + + private static final int DEFAULT_INITIAL_SIZE = 128; + + private final String groupField; + private final SentinelIntSet ordSet; + private final List groups; + + private FieldCache.StringIndex index; + + /** + * Expert: Constructs an {@link AllGroupsCollector} + * + * @param groupField The field to group by + * @param initialSize The initial size of the {@link SentinelIntSet}. The initial size should roughly match the total + * number of expected unique groups. Be aware that the heap usage is 4 bytes * initialSize. + */ + public AllGroupsCollector(String groupField, int initialSize) { + this.groupField = groupField; + ordSet = new SentinelIntSet(initialSize, -1); + groups = new ArrayList(initialSize); + } + + /** + * Constructs an {@link AllGroupsCollector}. This sets the initialSize for the {@link SentinelIntSet} to 128 in + * the {@link #AllGroupsCollector(String, int)} constructor. + * + * @param groupField The field to group by + */ + public AllGroupsCollector(String groupField) { + this(groupField, DEFAULT_INITIAL_SIZE); + } + + public void setScorer(Scorer scorer) throws IOException { + } + + public void collect(int doc) throws IOException { + int key = index.order[doc]; + if (!ordSet.exists(key)) { + ordSet.put(key); + String term = key == 0 ? null : index.lookup[index.order[doc]]; + groups.add(term); + } + } + + /** + * Returns the total number of groups for the executed search. + * This is a convenience method. The following code snippet has the same effect:

<pre>getGroups().size()</pre>
+ * + * @return The total number of groups for the executed search + */ + public int getGroupCount() { + return groups.size(); + } + + /** + * Returns the group values + *

+ * This is an unordered collection of group values. For each group that matched the query there is a {@link String} + * representing a group value. + * + * @return the group values + */ + public Collection getGroups() { + return groups; + } + + public void setNextReader(IndexReader reader, int docBase) throws IOException { + index = FieldCache.DEFAULT.getStringIndex(reader, groupField); + + // Clear ordSet and fill it with previously encountered groups that can occur in the current segment. + ordSet.clear(); + for (String countedGroup : groups) { + int ord = index.binarySearchLookup(countedGroup); + if (ord >= 0) { + ordSet.put(ord); + } + } + } + + public boolean acceptsDocsOutOfOrder() { + return true; + } +} Index: lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java =================================================================== --- lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java (revision 1103038) +++ lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java (revision ) @@ -21,9 +21,6 @@ /** Represents result returned by a grouping search. * - * Note that we do not return the total number of unique - * groups; doing so would be costly. - * * @lucene.experimental */ public class TopGroups { /** Number of documents matching the search */ @@ -32,6 +29,9 @@ /** Number of documents grouped into the topN groups */ public final int totalGroupedHitCount; + /** The total number of unique groups. If null this value is not computed. 
*/ + public final Integer totalGroupCount; + /** Group results in groupSort order */ public final GroupDocs[] groups; @@ -47,5 +47,16 @@ this.totalHitCount = totalHitCount; this.totalGroupedHitCount = totalGroupedHitCount; this.groups = groups; + this.totalGroupCount = null; } + + public TopGroups(TopGroups oldTopGroups, Integer totalGroupCount) { + this.groupSort = oldTopGroups.groupSort; + this.withinGroupSort = oldTopGroups.withinGroupSort; + this.totalHitCount = oldTopGroups.totalHitCount; + this.totalGroupedHitCount = oldTopGroups.totalGroupedHitCount; + this.groups = oldTopGroups.groups; + this.totalGroupCount = totalGroupCount; -} + } + +} Index: lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java =================================================================== --- lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (revision 1103038) +++ lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (revision ) @@ -17,13 +17,7 @@ package org.apache.lucene.search.grouping; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; +import java.util.*; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; @@ -32,14 +26,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.FieldCache; -import org.apache.lucene.search.FieldDoc; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.SortField; -import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; import 
org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -241,6 +228,7 @@ boolean fillFields, boolean getScores, boolean getMaxScores, + boolean doTotalGroupCount, Sort groupSort, Sort docSort, int topNGroups, @@ -256,6 +244,7 @@ final List sortedGroupFields = new ArrayList(); int totalHitCount = 0; + Set knownGroups = new HashSet(); for(GroupDoc d : groupDocs) { // TODO: would be better to filter by searchTerm before sorting! @@ -263,6 +252,12 @@ continue; } totalHitCount++; + if (doTotalGroupCount) { + if (!knownGroups.contains(d.group)) { + knownGroups.add(d.group); + } + } + List l = groups.get(d.group); if (l == null) { sortedGroups.add(d.group); @@ -315,8 +310,15 @@ fillFields ? sortedGroupFields.get(idx) : null); } + if (doTotalGroupCount) { + return new TopGroups( + new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result), + knownGroups.size() + ); + } else { - return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result); - } + return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result); + } + } public void testRandom() throws Exception { for(int iter=0;iter<3;iter++) { @@ -428,10 +430,18 @@ //final int docOffset = 0; final boolean doCache = random.nextBoolean(); + final boolean doTotalGroupCount = random.nextBoolean(); if (VERBOSE) { - System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup); + System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doTotalGroupCount=" + doTotalGroupCount); } + final 
AllGroupsCollector groupCountCollector; + if (doTotalGroupCount) { + groupCountCollector = new AllGroupsCollector("group"); + } else { + groupCountCollector = null; + } + final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups); final CachingCollector cCache; final Collector c; @@ -440,7 +450,16 @@ if (VERBOSE) { System.out.println("TEST: maxCacheMB=" + maxCacheMB); } + + if (doTotalGroupCount) { + cCache = new CachingCollector(c1, true, maxCacheMB); + c = MultiCollector.wrap(cCache, groupCountCollector); + } else { - c = cCache = new CachingCollector(c1, true, maxCacheMB); + c = cCache = new CachingCollector(c1, true, maxCacheMB); + } + } else if (doTotalGroupCount) { + c = MultiCollector.wrap(c1, groupCountCollector); + cCache = null; } else { c = c1; cCache = null; @@ -476,7 +495,12 @@ s.search(new TermQuery(new Term("content", searchTerm)), c2); } + if (doTotalGroupCount) { + TopGroups tempTopGroups = c2.getTopGroups(docOffset); + groupsResult = new TopGroups(tempTopGroups, groupCountCollector.getGroupCount()); + } else { - groupsResult = c2.getTopGroups(docOffset); + groupsResult = c2.getTopGroups(docOffset); + } } else { groupsResult = null; if (VERBOSE) { @@ -484,7 +508,7 @@ } } - final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset); + final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doTotalGroupCount, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset); try { // NOTE: intentional but temporary field cache insanity! 
@@ -509,6 +533,9 @@ assertEquals(expected.groups.length, actual.groups.length); assertEquals(expected.totalHitCount, actual.totalHitCount); assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount); + if (expected.totalGroupCount != null) { + assertEquals(expected.totalGroupCount, actual.totalGroupCount); + } for(int groupIDX=0;groupIDX