Index: lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java
===================================================================
--- lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java (revision )
+++ lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TotalGroupCountCollectorTest.java (revision )
@@ -0,0 +1,107 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Verifies that {@link TotalGroupCountCollector} counts every distinct group among the hits
+ * exactly once, including the hits that have no value in the group field.
+ */
+public class TotalGroupCountCollectorTest extends LuceneTestCase {
+
+  private static final String GROUP_FIELD = "author";
+
+  /**
+   * Indexes seven documents, with a commit after the third, and checks the group count
+   * reported for three different content queries.
+   */
+  public void testTotalGroupCount() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(
+        random,
+        dir,
+        newIndexWriterConfig(TEST_VERSION_CURRENT,
+            new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
+
+    addDocument(w, "author1", "random text", "1"); // 0
+    addDocument(w, "author1", "some more random text blob", "2"); // 1
+    addDocument(w, "author1", "some more random textual data", "3"); // 2
+    w.commit(); // To ensure a second segment
+
+    addDocument(w, "author2", "some random text", "4"); // 3
+    addDocument(w, "author3", "some more random text", "5"); // 4
+    addDocument(w, "author3", "random blob", "6"); // 5
+    // 6 -- no author field
+    addDocument(w, null, "random word stuck in alot of other text", "7");
+
+    IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
+    w.close();
+
+    try {
+      // author1, author2, author3 plus the document without an author
+      assertGroupCount(indexSearcher, "random", 4);
+      // author1, author2 and author3
+      assertGroupCount(indexSearcher, "some", 3);
+      // author1 and author3
+      assertGroupCount(indexSearcher, "blob", 2);
+    } finally {
+      // Release the reader and the directory even when an assertion fails.
+      indexSearcher.getIndexReader().close();
+      dir.close();
+    }
+  }
+
+  /**
+   * Adds one document to the index.
+   *
+   * @param w the writer to add the document to
+   * @param author the group value, or {@code null} to leave the group field out
+   * @param content the analyzed text that the test queries match against
+   * @param id a stored, non-indexed identifier for the document
+   */
+  private void addDocument(RandomIndexWriter w, String author, String content, String id)
+      throws Exception {
+    Document doc = new Document();
+    if (author != null) {
+      doc.add(new Field(GROUP_FIELD, author, Field.Store.YES, Field.Index.ANALYZED));
+    }
+    doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", id, Field.Store.YES, Field.Index.NO));
+    w.addDocument(doc);
+  }
+
+  /**
+   * Runs a term query on the content field and asserts the number of distinct groups
+   * counted among the hits.
+   */
+  private void assertGroupCount(IndexSearcher searcher, String contentTerm, int expected)
+      throws Exception {
+    TotalGroupCountCollector collector = new TotalGroupCountCollector(GROUP_FIELD);
+    searcher.search(new TermQuery(new Term("content", contentTerm)), collector);
+    assertEquals(expected, collector.getGroupCount());
+  }
+}
Index: lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/TotalGroupCountCollector.java
===================================================================
--- lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/TotalGroupCountCollector.java (revision )
+++ 
lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/TotalGroupCountCollector.java (revision )
@@ -0,0 +1,128 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.Scorer;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A collector that counts the total number of groups.
+ * <p/>
+ * Internally the {@link SentinelIntSet} is responsible for detecting if a group is already added to the total count.
+ * For each segment the {@link SentinelIntSet} is cleared and filled with previous counted groups that occur in
+ * the new segment.
+ *
+ * @lucene.experimental
+ */
+public class TotalGroupCountCollector extends Collector {
+
+  private static final int DEFAULT_INITIAL_SIZE = 128;
+
+  private final String groupField;
+  private final SentinelIntSet ordSet;
+  // Term values of the groups counted so far (null for ord 0). They are kept so that each
+  // new segment can look its own ords up for them in setNextReader.
+  private final List<String> countedGroups = new ArrayList<String>();
+
+  private int groupCount;
+  private FieldCache.StringIndex index;
+
+  /**
+   * Expert: Constructs a {@link TotalGroupCountCollector}
+   *
+   * @param groupField The field to group by
+   * @param initialSize The initial size of the {@link SentinelIntSet}. The initial size should roughly match the total
+   * number of expected unique groups. Be aware that the heap usage is 4 bytes * initialSize.
+   */
+  public TotalGroupCountCollector(String groupField, int initialSize) {
+    this.groupField = groupField;
+    ordSet = new SentinelIntSet(initialSize, -1);
+  }
+
+  /**
+   * Constructs a {@link TotalGroupCountCollector}. This sets the initialSize for the {@link SentinelIntSet} to 128 in
+   * the {@link #TotalGroupCountCollector(String, int)} constructor
+   *
+   * @param groupField The field to group by
+   */
+  public TotalGroupCountCollector(String groupField) {
+    this(groupField, DEFAULT_INITIAL_SIZE);
+  }
+
+  /**
+   * Does nothing: this collector does not use the scorer.
+   */
+  @Override
+  public void setScorer(Scorer scorer) throws IOException {
+  }
+
+  /**
+   * Counts the group of the given document unless that group has been counted before.
+   *
+   * @param doc The id of the collected document, used to index the current reader's ord array
+   */
+  @Override
+  public void collect(int doc) throws IOException {
+    int key = index.order[doc];
+    if (!ordSet.exists(key)) {
+      groupCount++;
+      ordSet.put(key);
+      // Ord 0 is recorded as null; every other ord is recorded by its term value.
+      String term = key == 0 ? null : index.lookup[key];
+      countedGroups.add(term);
+    }
+  }
+
+  /**
+   * @return The total number of groups for the executed search
+   */
+  public int getGroupCount() {
+    return groupCount;
+  }
+
+  /**
+   * Loads the string index of the group field for the new reader and re-seeds the ord set.
+   */
+  @Override
+  public void setNextReader(IndexReader reader, int docBase) throws IOException {
+    index = FieldCache.DEFAULT.getStringIndex(reader, groupField);
+
+    // Clear ordSet and fill it with previous encountered groups that can occur in the current segment.
+    ordSet.clear();
+    for (String countedGroup : countedGroups) {
+      int ord = index.binarySearchLookup(countedGroup);
+      if (ord >= 0) {
+        ordSet.put(ord);
+      }
+    }
+  }
+
+  /**
+   * @return Always {@code true}
+   */
+  @Override
+  public boolean acceptsDocsOutOfOrder() {
+    return true;
+  }
+}
Index: lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
===================================================================
--- lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (revision 1103038)
+++ lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (revision )
@@ -17,13 +17,7 @@
 
 package org.apache.lucene.search.grouping;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
+import java.util.*;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
@@ -32,14 +26,7 @@
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.FieldCache;
-import org.apache.lucene.search.FieldDoc;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.*;
 import 
org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -241,6 +228,7 @@ boolean fillFields, boolean getScores, boolean getMaxScores, + boolean doTotalGroupCount, Sort groupSort, Sort docSort, int topNGroups, @@ -256,6 +244,7 @@ final List sortedGroupFields = new ArrayList(); int totalHitCount = 0; + Set knownGroups = new HashSet(); for(GroupDoc d : groupDocs) { // TODO: would be better to filter by searchTerm before sorting! @@ -263,6 +252,12 @@ continue; } totalHitCount++; + if (doTotalGroupCount) { + if (!knownGroups.contains(d.group)) { + knownGroups.add(d.group); + } + } + List l = groups.get(d.group); if (l == null) { sortedGroups.add(d.group); @@ -315,8 +310,12 @@ fillFields ? sortedGroupFields.get(idx) : null); } - return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result); + if (doTotalGroupCount) { + return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, knownGroups.size()); + } else { + return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, null); - } + } + } public void testRandom() throws Exception { for(int iter=0;iter<3;iter++) { @@ -428,10 +427,18 @@ //final int docOffset = 0; final boolean doCache = random.nextBoolean(); + final boolean doTotalGroupCount = random.nextBoolean(); if (VERBOSE) { - System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup); + System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doTotalGroupCount=" + 
doTotalGroupCount); } + final TotalGroupCountCollector groupCountCollector; + if (doTotalGroupCount) { + groupCountCollector = new TotalGroupCountCollector("group"); + } else { + groupCountCollector = null; + } + final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups); final CachingCollector cCache; final Collector c; @@ -440,7 +447,16 @@ if (VERBOSE) { System.out.println("TEST: maxCacheMB=" + maxCacheMB); } + + if (doTotalGroupCount) { + cCache = new CachingCollector(c1, true, maxCacheMB); + c = MultiCollector.wrap(cCache, groupCountCollector); + } else { - c = cCache = new CachingCollector(c1, true, maxCacheMB); + c = cCache = new CachingCollector(c1, true, maxCacheMB); + } + } else if (doTotalGroupCount) { + c = MultiCollector.wrap(c1, groupCountCollector); + cCache = null; } else { c = c1; cCache = null; @@ -476,7 +492,12 @@ s.search(new TermQuery(new Term("content", searchTerm)), c2); } + if (doTotalGroupCount) { + TopGroups tempTopGroups = c2.getTopGroups(docOffset); + groupsResult = new TopGroups(tempTopGroups.groupSort, tempTopGroups.withinGroupSort, tempTopGroups.totalHitCount, tempTopGroups.totalGroupedHitCount, tempTopGroups.groups, groupCountCollector.getGroupCount()); + } else { - groupsResult = c2.getTopGroups(docOffset); + groupsResult = c2.getTopGroups(docOffset); + } } else { groupsResult = null; if (VERBOSE) { @@ -484,7 +505,7 @@ } } - final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset); + final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doTotalGroupCount, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset); try { // NOTE: intentional but temporary field cache insanity! 
@@ -509,6 +530,9 @@ assertEquals(expected.groups.length, actual.groups.length); assertEquals(expected.totalHitCount, actual.totalHitCount); assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount); + if (expected.totalGroupCount != null) { + assertEquals(expected.totalGroupCount, actual.totalGroupCount); + } for(int groupIDX=0;groupIDXnull this value is not computed. */ + public final Integer totalGroupCount; + /** Group results in groupSort order */ public final GroupDocs[] groups; @@ -41,11 +41,12 @@ /** How docs are sorted within each group */ public final SortField[] withinGroupSort; - public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups) { + public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups, Integer totalGroupCount) { this.groupSort = groupSort; this.withinGroupSort = withinGroupSort; this.totalHitCount = totalHitCount; this.totalGroupedHitCount = totalGroupedHitCount; this.groups = groups; + this.totalGroupCount = totalGroupCount; } }