Index: modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java =================================================================== --- modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java (revision 1103024) +++ modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java (revision ) @@ -21,9 +21,6 @@ /** Represents result returned by a grouping search. * - * Note that we do not return the total number of unique - * groups; doing so would be costly. - * * @lucene.experimental */ public class TopGroups { /** Number of documents matching the search */ @@ -32,6 +29,9 @@ /** Number of documents grouped into the topN groups */ public final int totalGroupedHitCount; + /** The total number of unique groups, or {@code null} if this value was not computed. */ + public final Integer totalGroupCount; + /** Group results in groupSort order */ public final GroupDocs[] groups; @@ -42,10 +42,15 @@ public final SortField[] withinGroupSort; public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups) { + this(groupSort, withinGroupSort, totalHitCount, totalGroupedHitCount, groups, null); + } + + /** Creates a result that also carries the total number of unique groups; the shorter constructor delegates here with a null totalGroupCount, meaning the count was not computed. */ public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups, Integer totalGroupCount) { this.groupSort = groupSort; this.withinGroupSort = withinGroupSort; this.totalHitCount = totalHitCount; this.totalGroupedHitCount = totalGroupedHitCount; this.groups = groups; + this.totalGroupCount = totalGroupCount; } } Index: modules/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java =================================================================== --- modules/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java (revision ) +++ modules/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java (revision ) @@ -0,0 +1,109 
@@ +package org.apache.lucene.search.grouping; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +public class TotalGroupCountCollectorTest extends LuceneTestCase { + + /** Indexes seven documents (authors author1, author2 and author3, plus one document without an author field) and checks the number of unique groups counted for three term queries on the content field. */ public void testTotalGroupCount() throws Exception { + + final String groupField = "author"; + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + random, + dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + // 0 + Document doc = new Document(); + doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "random text", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + // 1 + doc = new Document(); + doc.add(new 
Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "some more random text blob", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + // 2 + doc = new Document(); + doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "some more random textual data", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + w.commit(); // To ensure a second segment + + // 3 + doc = new Document(); + doc.add(new Field(groupField, "author2", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "some random text", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + // 4 + doc = new Document(); + doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + // 5 + doc = new Document(); + doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("content", "random blob", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + // 6 -- no author field + doc = new Document(); + doc.add(new Field("content", "random word stuck in alot of other text", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO)); + w.addDocument(doc); + + IndexSearcher indexSearcher = new IndexSearcher(w.getReader()); + w.close(); + + /* The term random occurs in the content of all seven documents, so the expected groups are author1, author2, author3 and the author-less document, which is counted as a group of its own. */ TotalGroupCountCollector c1 = new TotalGroupCountCollector(groupField, 10); + indexSearcher.search(new TermQuery(new Term("content", "random")), c1); + 
assertEquals(4, c1.getGroupCount()); + + TotalGroupCountCollector c2 = new TotalGroupCountCollector(groupField, 10); + indexSearcher.search(new TermQuery(new Term("content", "some")), c2); + assertEquals(3, c2.getGroupCount()); + + TotalGroupCountCollector c3 = new TotalGroupCountCollector(groupField, 10); + indexSearcher.search(new TermQuery(new Term("content", "blob")), c3); + assertEquals(2, c3.getGroupCount()); + + indexSearcher.getIndexReader().close(); + dir.close(); + } +} Index: modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java =================================================================== --- modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (revision 1103024) +++ modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (revision ) @@ -17,13 +17,7 @@ package org.apache.lucene.search.grouping; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; +import java.util.*; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; @@ -32,14 +26,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.FieldCache; -import org.apache.lucene.search.FieldDoc; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Sort; -import org.apache.lucene.search.SortField; -import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.*; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; @@ -120,7 +107,7 @@ final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), 
groupSort, null, 5, true, false, true); indexSearcher.search(new TermQuery(new Term("content", "random")), c2); - + final TopGroups groups = c2.getTopGroups(0); assertEquals(7, groups.totalHitCount); @@ -242,6 +229,7 @@ boolean fillFields, boolean getScores, boolean getMaxScores, + boolean doTotalGroupCount, Sort groupSort, Sort docSort, int topNGroups, @@ -257,6 +245,8 @@ final List sortedGroupFields = new ArrayList(); int totalHitCount = 0; + /* Expected number of distinct group values among the docs that pass the filter below; only incremented when doTotalGroupCount is set. */ int uniqueGroupCount = 0; + Set knownGroups = new HashSet(); for(GroupDoc d : groupDocs) { // TODO: would be better to filter by searchTerm before sorting! @@ -264,6 +254,14 @@ continue; } totalHitCount++; + + if (doTotalGroupCount) { + if (!knownGroups.contains(d.group)) { + uniqueGroupCount++; + knownGroups.add(d.group); + } + } + List l = groups.get(d.group); if (l == null) { sortedGroups.add(d.group); @@ -316,8 +314,12 @@ fillFields ? sortedGroupFields.get(idx) : null); } + if (doTotalGroupCount) { + return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, uniqueGroupCount); + } else { - return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result); - } + return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result); + } + } public void testRandom() throws Exception { for(int iter=0;iter<3;iter++) { @@ -334,7 +336,7 @@ if (VERBOSE) { System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups); } - + final List groups = new ArrayList(); for(int i=0;i countedGroups = new LinkedList(); + /* Scratch instance handed to binarySearchLookup in setNextReader. */ private final BytesRef spareBytesRef = new BytesRef(); + + /* Number of distinct groups counted so far. */ private int groupCount; + /* Terms index of the segment currently being collected; replaced on every setNextReader call. */ private FieldCache.DocTermsIndex index; + + /** + * Constructs a {@link TotalGroupCountCollector} + * + * @param groupField The field to group by + * @param initialSize The initial size of the {@link SentinelIntSet} + */ + public TotalGroupCountCollector(String groupField, int initialSize) { + 
this.groupField = groupField; + /* NOTE(review): -1 is presumably the value SentinelIntSet uses to mark an empty slot -- confirm against SentinelIntSet. */ ordSet = new SentinelIntSet(initialSize, -1); + } + + /** Does nothing: this collector never reads document scores. */ public void setScorer(Scorer scorer) throws IOException { + } + + /** Counts the group of {@code doc} if its ordinal is not yet in {@code ordSet}, and remembers the group value (null when the ordinal is 0) so that setNextReader can look it up in later segments. */ public void collect(int doc) throws IOException { + int key = index.getOrd(doc); + if (!ordSet.exists(key)) { + groupCount++; + ordSet.put(key); + BytesRef term = key == 0 ? null : index.getTerm(doc, new BytesRef()); + countedGroups.add(term); + } + } + + /** + * @return The total number of groups for the executed search + */ + public int getGroupCount() { + return groupCount; + } + + /** Loads the terms index of the new segment and re-seeds {@code ordSet} with the ordinals that the groups counted so far have in this segment; groups for which the lookup returns a negative value are skipped. */ public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException { + index = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField); + + // Clear ordSet and fill it with previously encountered groups that can occur in the current segment. + ordSet.clear(); + for (BytesRef countedGroup : countedGroups) { + int ord = index.binarySearchLookup(countedGroup, spareBytesRef); +// System.out.println(String.format("ord: %d| value: %s", ord, countedGroup == null ? null : countedGroup.utf8ToString())); + if (ord >= 0) { + ordSet.put(ord); + } + } + } + + /** Always returns true; {@code collect} only tests and updates set membership, so it does not rely on documents arriving in order. */ public boolean acceptsDocsOutOfOrder() { + return true; + } +} \ No newline at end of file