Index: modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java
===================================================================
--- modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java (revision 1103150)
+++ modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java (revision )
@@ -153,7 +153,7 @@
return new TopGroups(groupSort.getSort(),
withinGroupSort == null ? null : withinGroupSort.getSort(),
- totalHitCount, totalGroupedHitCount, groupDocsResult);
+ totalHitCount, totalGroupedHitCount, groupDocsResult, null);
}
}
Index: modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java
===================================================================
--- modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java (revision 1103024)
+++ modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java (revision )
@@ -21,9 +21,6 @@
/** Represents result returned by a grouping search.
*
- * Note that we do not return the total number of unique
- * groups; doing so would be costly.
- *
* @lucene.experimental */
public class TopGroups {
/** Number of documents matching the search */
@@ -32,6 +29,9 @@
/** Number of documents grouped into the topN groups */
public final int totalGroupedHitCount;
+ /** The total number of unique groups. If null this value is not computed. */
+ public final Integer totalGroupCount;
+
/** Group results in groupSort order */
public final GroupDocs[] groups;
@@ -41,11 +41,12 @@
/** How docs are sorted within each group */
public final SortField[] withinGroupSort;
- public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups) {
+ public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups, Integer totalGroupCount) {
this.groupSort = groupSort;
this.withinGroupSort = withinGroupSort;
this.totalHitCount = totalHitCount;
this.totalGroupedHitCount = totalGroupedHitCount;
this.groups = groups;
+ this.totalGroupCount = totalGroupCount;
}
}
Index: modules/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java
===================================================================
--- modules/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java (revision )
+++ modules/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java (revision )
@@ -0,0 +1,109 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TotalGroupCountCollectorTest extends LuceneTestCase {
+
+ public void testTotalGroupCount() throws Exception {
+
+ final String groupField = "author";
+
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(
+ random,
+ dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT,
+ new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
+ // 0
+ Document doc = new Document();
+ doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "random text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 1
+ doc = new Document();
+ doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some more random text blob", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 2
+ doc = new Document();
+ doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some more random textual data", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+ w.commit(); // To ensure a second segment
+
+ // 3
+ doc = new Document();
+ doc.add(new Field(groupField, "author2", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some random text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 4
+ doc = new Document();
+ doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 5
+ doc = new Document();
+ doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "random blob", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 6 -- no author field
+ doc = new Document();
+ doc.add(new Field("content", "random word stuck in alot of other text", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "7", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
+ w.close();
+
+ TotalGroupCountCollector c1 = new TotalGroupCountCollector(groupField);
+ indexSearcher.search(new TermQuery(new Term("content", "random")), c1);
+ assertEquals(4, c1.getGroupCount());
+
+ TotalGroupCountCollector c2 = new TotalGroupCountCollector(groupField);
+ indexSearcher.search(new TermQuery(new Term("content", "some")), c2);
+ assertEquals(3, c2.getGroupCount());
+
+ TotalGroupCountCollector c3 = new TotalGroupCountCollector(groupField);
+ indexSearcher.search(new TermQuery(new Term("content", "blob")), c3);
+ assertEquals(2, c3.getGroupCount());
+
+ indexSearcher.getIndexReader().close();
+ dir.close();
+ }
+}
Index: modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
===================================================================
--- modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (revision 1103024)
+++ modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (revision )
@@ -17,13 +17,7 @@
package org.apache.lucene.search.grouping;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
+import java.util.*;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -32,14 +26,7 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
-import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.FieldCache;
-import org.apache.lucene.search.FieldDoc;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@@ -120,7 +107,7 @@
final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true);
indexSearcher.search(new TermQuery(new Term("content", "random")), c2);
-
+
final TopGroups groups = c2.getTopGroups(0);
assertEquals(7, groups.totalHitCount);
@@ -242,6 +229,7 @@
boolean fillFields,
boolean getScores,
boolean getMaxScores,
+ boolean doTotalGroupCount,
Sort groupSort,
Sort docSort,
int topNGroups,
@@ -257,6 +245,7 @@
final List sortedGroupFields = new ArrayList();
int totalHitCount = 0;
+      Set<BytesRef> knownGroups = new HashSet<BytesRef>();
for(GroupDoc d : groupDocs) {
// TODO: would be better to filter by searchTerm before sorting!
@@ -264,6 +253,13 @@
continue;
}
totalHitCount++;
+
+ if (doTotalGroupCount) {
+ if (!knownGroups.contains(d.group)) {
+ knownGroups.add(d.group);
+ }
+ }
+
List l = groups.get(d.group);
if (l == null) {
sortedGroups.add(d.group);
@@ -316,8 +312,12 @@
fillFields ? sortedGroupFields.get(idx) : null);
}
- return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
+ if (doTotalGroupCount) {
+ return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, knownGroups.size());
+ } else {
+ return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, null);
- }
+ }
+ }
public void testRandom() throws Exception {
for(int iter=0;iter<3;iter++) {
@@ -334,7 +334,7 @@
if (VERBOSE) {
System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
}
-
+
     final List<BytesRef> groups = new ArrayList<BytesRef>();
     for(int i=0;i<numGroups;i++) {
Index: modules/grouping/src/java/org/apache/lucene/search/grouping/TotalGroupCountCollector.java
===================================================================
--- modules/grouping/src/java/org/apache/lucene/search/grouping/TotalGroupCountCollector.java	(revision )
+++ modules/grouping/src/java/org/apache/lucene/search/grouping/TotalGroupCountCollector.java	(revision )
@@ -0,0 +1,100 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.util.BytesRef;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * A collector that counts the total number of unique groups for the executed search.
+ * <p/>
+ * Internally the {@link SentinelIntSet} is responsible for detecting if a group is already added to the total count.
+ * For each segment the {@link SentinelIntSet} is cleared and filled with previous counted groups that occur in
+ * the new segment.
+ *
+ * @lucene.experimental
+ */
+public class TotalGroupCountCollector extends Collector {
+
+ private static final int DEFAULT_INITIAL_SIZE = 128;
+
+ private final String groupField;
+ private final SentinelIntSet ordSet;
+  private final List<BytesRef> countedGroups = new LinkedList<BytesRef>();
+ private final BytesRef spareBytesRef = new BytesRef();
+
+ private int groupCount;
+ private FieldCache.DocTermsIndex index;
+
+ /**
+ * Expert: Constructs a {@link TotalGroupCountCollector}
+ *
+ * @param groupField The field to group by
+ * @param initialSize The initial size of the {@link SentinelIntSet}. The initial size should roughly match the total
+ * number of expected unique groups. Be aware that the heap usage is 4 bytes * initialSize.
+ */
+ public TotalGroupCountCollector(String groupField, int initialSize) {
+ this.groupField = groupField;
+ ordSet = new SentinelIntSet(initialSize, -1);
+ }
+
+ /**
+ * Constructs a {@link TotalGroupCountCollector}. This sets the initialSize for the {@link SentinelIntSet} to 128 in
+ * the {@link #TotalGroupCountCollector(String, int)} constructor
+ *
+ * @param groupField The field to group by
+ */
+ public TotalGroupCountCollector(String groupField) {
+ this(groupField, DEFAULT_INITIAL_SIZE);
+ }
+
+ public void setScorer(Scorer scorer) throws IOException {
+ }
+
+ public void collect(int doc) throws IOException {
+ int key = index.getOrd(doc);
+ if (!ordSet.exists(key)) {
+ groupCount++;
+ ordSet.put(key);
+ BytesRef term = key == 0 ? null : index.getTerm(doc, new BytesRef());
+ countedGroups.add(term);
+ }
+ }
+
+ /**
+ * @return The total number of groups for the executed search
+ */
+ public int getGroupCount() {
+ return groupCount;
+ }
+
+ public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
+ index = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField);
+
+ // Clear ordSet and fill it with previous encountered groups that can occur in the current segment.
+ ordSet.clear();
+ for (BytesRef countedGroup : countedGroups) {
+ int ord = index.binarySearchLookup(countedGroup, spareBytesRef);
+ if (ord >= 0) {
+ ordSet.put(ord);
+ }
+ }
+ }
+
+ public boolean acceptsDocsOutOfOrder() {
+ return true;
+ }
+}
\ No newline at end of file