Index: lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java
===================================================================
--- lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java (revision )
+++ lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java (revision )
@@ -0,0 +1,101 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Tests that {@link TotalGroupCountCollector} reports the number of distinct groups
+ * among the documents matching a query.
+ */
+public class TotalGroupCountCollectorTest extends LuceneTestCase {
+
+  private static final String GROUP_FIELD = "author";
+
+  /**
+   * Indexes seven documents (six spread over three authors, one without an author field)
+   * and checks the group count for three term queries.
+   */
+  public void testTotalGroupCount() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(
+        random,
+        dir,
+        newIndexWriterConfig(TEST_VERSION_CURRENT,
+            new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
+
+    addDoc(w, "author1", "random text", "1");                   // 0
+    addDoc(w, "author1", "some more random text blob", "2");    // 1
+    addDoc(w, "author1", "some more random textual data", "3"); // 2
+    w.commit(); // To ensure a second segment
+
+    addDoc(w, "author2", "some random text", "4");              // 3
+    addDoc(w, "author3", "some more random text", "5");         // 4
+    addDoc(w, "author3", "random blob", "6");                   // 5
+    // 6 -- no author field
+    addDoc(w, null, "random word stuck in alot of other text", "7");
+
+    IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
+    try {
+      w.close();
+      // author1, author2, author3 and the document without an author
+      assertEquals(4, countGroups(indexSearcher, "random"));
+      // author1, author2, author3
+      assertEquals(3, countGroups(indexSearcher, "some"));
+      // author1 (doc 1) and author3 (doc 5)
+      assertEquals(2, countGroups(indexSearcher, "blob"));
+    } finally {
+      // Release the reader and directory even when an assertion fails.
+      indexSearcher.getIndexReader().close();
+      dir.close();
+    }
+  }
+
+  /**
+   * Adds one document to the writer.
+   *
+   * @param w       the writer to add to
+   * @param author  value of the group field, or null to omit the group field
+   * @param content analyzed text stored in the "content" field
+   * @param id      stored-only identifier
+   */
+  private static void addDoc(RandomIndexWriter w, String author, String content, String id) throws Exception {
+    Document doc = new Document();
+    if (author != null) {
+      doc.add(new Field(GROUP_FIELD, author, Field.Store.YES, Field.Index.ANALYZED));
+    }
+    doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", id, Field.Store.YES, Field.Index.NO));
+    w.addDocument(doc);
+  }
+
+  /** Runs a term query on the "content" field and returns the collected group count. */
+  private static int countGroups(IndexSearcher searcher, String term) throws Exception {
+    TotalGroupCountCollector collector = new TotalGroupCountCollector(GROUP_FIELD);
+    searcher.search(new TermQuery(new Term("content", term)), collector);
+    return collector.getGroupCount();
+  }
+}
Index: lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/TotalGroupCountCollector.java
===================================================================
--- lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/TotalGroupCountCollector.java (revision )
+++ lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/TotalGroupCountCollector.java (revision )
@@ -0,0 +1,131 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.Scorer;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * A collector that counts the total number of groups.
+ *
+ * <p>Internally the {@link SentinelIntSet} is responsible for detecting if a group is already added to the total count.
+ * For each segment the {@link SentinelIntSet} is cleared and filled with previous counted groups that occur in
+ * the new segment.
+ *
+ * @lucene.experimental
+ */
+public class TotalGroupCountCollector extends Collector {
+
+  private static final int DEFAULT_INITIAL_SIZE = 128;
+
+  private final String groupField;
+
+  /** Ords, valid for the current segment only, of the groups that are already counted. */
+  private final SentinelIntSet ordSet;
+
+  /** Values of every group counted so far; {@code null} is recorded for ord 0. */
+  private final List<String> countedGroups = new LinkedList<String>();
+
+  private int groupCount;
+
+  /** String index of {@link #groupField} for the current segment; set in {@link #setNextReader}. */
+  private FieldCache.StringIndex index;
+
+  /**
+   * Expert: Constructs a {@link TotalGroupCountCollector}
+   *
+   * @param groupField The field to group by
+   * @param initialSize The initial size of the {@link SentinelIntSet}. The initial size should roughly match the total
+   *                    number of expected unique groups. Be aware that the heap usage is 4 bytes * initialSize.
+   */
+  public TotalGroupCountCollector(String groupField, int initialSize) {
+    this.groupField = groupField;
+    ordSet = new SentinelIntSet(initialSize, -1);
+  }
+
+  /**
+   * Constructs a {@link TotalGroupCountCollector}. This sets the initialSize for the {@link SentinelIntSet} to 128 in
+   * the {@link #TotalGroupCountCollector(String, int)} constructor
+   *
+   * @param groupField The field to group by
+   */
+  public TotalGroupCountCollector(String groupField) {
+    this(groupField, DEFAULT_INITIAL_SIZE);
+  }
+
+  /** Does nothing: this collector does not use scores. */
+  @Override
+  public void setScorer(Scorer scorer) throws IOException {
+  }
+
+  /**
+   * Counts the group of the given document unless that group was counted before.
+   *
+   * @param doc The document id, used as index into the current segment's ord array
+   */
+  @Override
+  public void collect(int doc) throws IOException {
+    final int key = index.order[doc];
+    if (!ordSet.exists(key)) {
+      groupCount++;
+      ordSet.put(key);
+      // Ord 0 is recorded as null rather than looked up; setNextReader passes
+      // that null to binarySearchLookup when the next segment starts.
+      countedGroups.add(key == 0 ? null : index.lookup[key]);
+    }
+  }
+
+  /**
+   * @return The total number of groups for the executed search
+   */
+  public int getGroupCount() {
+    return groupCount;
+  }
+
+  /**
+   * Loads the string index of the group field for the next segment and re-seeds the ord set.
+   *
+   * @param reader The reader of the segment that is collected next
+   * @param docBase Not used
+   */
+  @Override
+  public void setNextReader(IndexReader reader, int docBase) throws IOException {
+    index = FieldCache.DEFAULT.getStringIndex(reader, groupField);
+
+    // Clear ordSet and fill it with previous encountered groups that can occur in the current segment.
+    ordSet.clear();
+    for (String countedGroup : countedGroups) {
+      int ord = index.binarySearchLookup(countedGroup);
+      if (ord >= 0) {
+        ordSet.put(ord);
+      }
+    }
+  }
+
+  /** @return {@code true}; the count does not depend on the order in which documents arrive */
+  @Override
+  public boolean acceptsDocsOutOfOrder() {
+    return true;
+  }
+}
Index: lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
===================================================================
--- lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (revision 1103038)
+++ lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (revision )
@@ -17,13 +17,7 @@
package org.apache.lucene.search.grouping;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
+import java.util.*;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -32,14 +26,7 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
-import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.FieldCache;
-import org.apache.lucene.search.FieldDoc;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
@@ -241,6 +228,7 @@
boolean fillFields,
boolean getScores,
boolean getMaxScores,
+ boolean doTotalGroupCount,
Sort groupSort,
Sort docSort,
int topNGroups,
@@ -256,6 +244,7 @@
final List sortedGroupFields = new ArrayList();
int totalHitCount = 0;
+ Set knownGroups = new HashSet();
for(GroupDoc d : groupDocs) {
// TODO: would be better to filter by searchTerm before sorting!
@@ -263,6 +252,12 @@
continue;
}
totalHitCount++;
+ if (doTotalGroupCount) {
+ if (!knownGroups.contains(d.group)) {
+ knownGroups.add(d.group);
+ }
+ }
+
List l = groups.get(d.group);
if (l == null) {
sortedGroups.add(d.group);
@@ -315,8 +310,12 @@
fillFields ? sortedGroupFields.get(idx) : null);
}
- return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
+ if (doTotalGroupCount) {
+ return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, knownGroups.size());
+ } else {
+ return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, null);
- }
+ }
+ }
public void testRandom() throws Exception {
for(int iter=0;iter<3;iter++) {
@@ -428,10 +427,18 @@
//final int docOffset = 0;
final boolean doCache = random.nextBoolean();
+ final boolean doTotalGroupCount = random.nextBoolean();
if (VERBOSE) {
- System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup);
+ System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doTotalGroupCount=" + doTotalGroupCount);
}
+ final TotalGroupCountCollector groupCountCollector;
+ if (doTotalGroupCount) {
+ groupCountCollector = new TotalGroupCountCollector("group");
+ } else {
+ groupCountCollector = null;
+ }
+
final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
final CachingCollector cCache;
final Collector c;
@@ -440,7 +447,16 @@
if (VERBOSE) {
System.out.println("TEST: maxCacheMB=" + maxCacheMB);
}
+
+ if (doTotalGroupCount) {
+ cCache = new CachingCollector(c1, true, maxCacheMB);
+ c = MultiCollector.wrap(cCache, groupCountCollector);
+ } else {
- c = cCache = new CachingCollector(c1, true, maxCacheMB);
+ c = cCache = new CachingCollector(c1, true, maxCacheMB);
+ }
+ } else if (doTotalGroupCount) {
+ c = MultiCollector.wrap(c1, groupCountCollector);
+ cCache = null;
} else {
c = c1;
cCache = null;
@@ -476,7 +492,12 @@
s.search(new TermQuery(new Term("content", searchTerm)), c2);
}
+ if (doTotalGroupCount) {
+ TopGroups tempTopGroups = c2.getTopGroups(docOffset);
+ groupsResult = new TopGroups(tempTopGroups.groupSort, tempTopGroups.withinGroupSort, tempTopGroups.totalHitCount, tempTopGroups.totalGroupedHitCount, tempTopGroups.groups, groupCountCollector.getGroupCount());
+ } else {
- groupsResult = c2.getTopGroups(docOffset);
+ groupsResult = c2.getTopGroups(docOffset);
+ }
} else {
groupsResult = null;
if (VERBOSE) {
@@ -484,7 +505,7 @@
}
}
- final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
+ final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doTotalGroupCount, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
try {
// NOTE: intentional but temporary field cache insanity!
@@ -509,6 +530,9 @@
assertEquals(expected.groups.length, actual.groups.length);
assertEquals(expected.totalHitCount, actual.totalHitCount);
assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount);
+ if (expected.totalGroupCount != null) {
+ assertEquals(expected.totalGroupCount, actual.totalGroupCount);
+ }
for(int groupIDX=0;groupIDXnull this value is not computed. */
+ public final Integer totalGroupCount;
+
/** Group results in groupSort order */
public final GroupDocs[] groups;
@@ -41,11 +41,12 @@
/** How docs are sorted within each group */
public final SortField[] withinGroupSort;
- public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups) {
+ public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups, Integer totalGroupCount) {
this.groupSort = groupSort;
this.withinGroupSort = withinGroupSort;
this.totalHitCount = totalHitCount;
this.totalGroupedHitCount = totalGroupedHitCount;
this.groups = groups;
+ this.totalGroupCount = totalGroupCount;
}
}