Index: modules/grouping/src/test/org/apache/lucene/search/grouping/DVDistinctCountCollectorTest.java =================================================================== --- modules/grouping/src/test/org/apache/lucene/search/grouping/DVDistinctCountCollectorTest.java (revision ) +++ modules/grouping/src/test/org/apache/lucene/search/grouping/DVDistinctCountCollectorTest.java (revision ) @@ -0,0 +1,254 @@ +package org.apache.lucene.search.grouping; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.codecs.lucene40.Lucene40Codec; +import org.apache.lucene.document.*; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.grouping.dv.DVDistinctCountCollector; +import org.apache.lucene.search.grouping.dv.DVFirstPassGroupingCollector; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + +public class DVDistinctCountCollectorTest extends LuceneTestCase { + + DocValues.Type[] dvTypes = new DocValues.Type[]{ + DocValues.Type.VAR_INTS, + DocValues.Type.FLOAT_64, + DocValues.Type.BYTES_VAR_STRAIGHT, + DocValues.Type.BYTES_VAR_SORTED + }; + + public void testBasic() throws Exception { + + final String groupField = "author"; + final String countField = "publisher"; + FieldType customType = new FieldType(); + customType.setStored(true); + DocValues.Type dvType = dvTypes[random.nextInt(dvTypes.length)]; + boolean diskResident = random.nextBoolean(); + + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()); + iwc.setCodec(new Lucene40Codec()); + RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc); + Document doc = new Document(); + addDocValue(doc, dvType, groupField, "1"); + addDocValue(doc, dvType, countField, "1"); + doc.add(new Field("content", "random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "1", customType)); + w.addDocument(doc); + + // 1 + doc = new Document(); + addDocValue(doc, dvType, groupField, "1"); + addDocValue(doc, dvType, countField, "1"); + doc.add(new Field("content", "some more random text blob", TextField.TYPE_STORED)); + doc.add(new Field("id", "2", customType)); + w.addDocument(doc); + + // 2 + doc = new Document(); + addDocValue(doc, dvType, groupField, "1"); + addDocValue(doc, dvType, countField, "2"); + doc.add(new Field("content", "some more random textual data", TextField.TYPE_STORED)); + doc.add(new Field("id", "3", customType)); + w.addDocument(doc); + w.commit(); // To ensure a second segment + + // 3 + doc = new Document(); + addDocValue(doc, dvType, groupField, "2"); + doc.add(new Field("content", "some random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "4", customType)); + w.addDocument(doc); + + // 4 + doc = new Document(); + addDocValue(doc, dvType, groupField, "3"); + addDocValue(doc, dvType, countField, "1"); + doc.add(new Field("content", "some more random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "5", customType)); + w.addDocument(doc); + + // 5 + doc = new Document(); + addDocValue(doc, dvType, groupField, "3"); + addDocValue(doc, dvType, countField, "1"); + doc.add(new Field(groupField, "author3", TextField.TYPE_STORED)); + doc.add(new Field("content", "random blob", TextField.TYPE_STORED)); + doc.add(new Field("id", "6", customType)); + w.addDocument(doc); + + // 6 -- no author field + doc = new Document(); + doc.add(new Field("content", "random word stuck in alot of other text", TextField.TYPE_STORED)); + addDocValue(doc, dvType, countField, "1"); + doc.add(new Field("id", "6", customType)); + w.addDocument(doc); + + IndexSearcher indexSearcher = newSearcher(w.getReader()); + w.close(); + + Comparator>> cmp = new Comparator>>() { + + public int compare(AbstractDistinctCountCollector.GroupCount> comparableGroupCount, AbstractDistinctCountCollector.GroupCount> comparableGroupCount1) { + return comparableGroupCount.groupValue.compareTo(comparableGroupCount1.groupValue); + } + + }; + + // === Search for content:random + AbstractFirstPassGroupingCollector> firstCollector = DVFirstPassGroupingCollector.create(new Sort(), 10, groupField, dvType, diskResident); + indexSearcher.search(new TermQuery(new Term("content", "random")), firstCollector); + AbstractDistinctCountCollector>> distinctCountCollector + = DVDistinctCountCollector.create(groupField, countField, firstCollector.getTopGroups(0, false), diskResident, dvType); + indexSearcher.search(new TermQuery(new Term("content", "random")), distinctCountCollector); + + List>> gcs = distinctCountCollector.getGroups(); + Collections.sort(gcs, cmp); + assertEquals(4, gcs.size()); + + compareNull(gcs.get(0).groupValue); + List countValues = new ArrayList(gcs.get(0).uniqueValues); + assertEquals(1, countValues.size()); + compare("1", countValues.get(0)); + + compare("1", gcs.get(1).groupValue); + countValues = new ArrayList(gcs.get(1).uniqueValues); + assertEquals(2, countValues.size()); + compare("1", countValues.get(0)); + compare("2", countValues.get(1)); + + compare("2", gcs.get(2).groupValue); + countValues = new ArrayList(gcs.get(2).uniqueValues); + assertEquals(1, countValues.size()); + compareNull(countValues.get(0)); + + compare("3", gcs.get(3).groupValue); + countValues = new ArrayList(gcs.get(3).uniqueValues); + assertEquals(1, countValues.size()); + compare("1", countValues.get(0)); + + // === Search for content:some + firstCollector = DVFirstPassGroupingCollector.create(new Sort(), 10, groupField, dvType, diskResident); + indexSearcher.search(new TermQuery(new Term("content", "some")), firstCollector); + distinctCountCollector = DVDistinctCountCollector.create(groupField, countField, firstCollector.getTopGroups(0, false), diskResident, dvType); + indexSearcher.search(new TermQuery(new Term("content", "some")), distinctCountCollector); + + gcs = distinctCountCollector.getGroups(); + Collections.sort(gcs, cmp); + assertEquals(3, gcs.size()); + + compare("1", gcs.get(0).groupValue); + countValues = new ArrayList(gcs.get(0).uniqueValues); + assertEquals(2, countValues.size()); + compare("1", countValues.get(0)); + compare("2", countValues.get(1)); + + compare("2", gcs.get(1).groupValue); + countValues = new ArrayList(gcs.get(1).uniqueValues); + assertEquals(1, countValues.size()); + compareNull(countValues.get(0)); + + compare("3", gcs.get(2).groupValue); + countValues = new ArrayList(gcs.get(2).uniqueValues); + assertEquals(1, countValues.size()); + compare("1", countValues.get(0)); + + // === Search for content:blob + firstCollector = DVFirstPassGroupingCollector.create(new Sort(), 10, groupField, dvType, diskResident); + indexSearcher.search(new TermQuery(new Term("content", "blob")), firstCollector); + distinctCountCollector = DVDistinctCountCollector.create(groupField, countField, firstCollector.getTopGroups(0, false), diskResident, dvType); + indexSearcher.search(new TermQuery(new Term("content", "blob")), distinctCountCollector); + + gcs = distinctCountCollector.getGroups(); + Collections.sort(gcs, cmp); + assertEquals(2, gcs.size()); + + compare("1", gcs.get(0).groupValue); + countValues = new ArrayList(gcs.get(0).uniqueValues); + // B/c the only one document matched with blob inside the author 1 group + assertEquals(1, countValues.size()); + compare("1", countValues.get(0)); + + compare("3", gcs.get(1).groupValue); + countValues = new ArrayList(gcs.get(1).uniqueValues); + assertEquals(1, countValues.size()); + compare("1", countValues.get(0)); + + indexSearcher.getIndexReader().close(); + dir.close(); + } + + private void addDocValue(Document document, DocValues.Type type, String field, String val) { + DocValuesField valuesField = new DocValuesField(field); + switch (type) { + case VAR_INTS: + valuesField.setInt(Integer.parseInt(val)); + break; + case FLOAT_64: + valuesField.setFloat(Double.parseDouble(val)); + break; + case BYTES_VAR_STRAIGHT: + case BYTES_VAR_SORTED: + valuesField.setBytes(new BytesRef(val), type); + break; + } + document.add(valuesField); + } + + private void compare(String expected, Object groupValue) { + if (BytesRef.class.isAssignableFrom(groupValue.getClass())) { + assertEquals(expected, ((BytesRef) groupValue).utf8ToString()); + } else if (Double.class.isAssignableFrom(groupValue.getClass())) { + assertEquals(Double.parseDouble(expected), groupValue); + } else if (Long.class.isAssignableFrom(groupValue.getClass())) { + assertEquals(Long.parseLong(expected), groupValue); + } else { + fail(); + } + } + + private void compareNull(Object groupValue) { + if (BytesRef.class.isAssignableFrom(groupValue.getClass())) { + assertEquals("", ((BytesRef) groupValue).utf8ToString()); + } else if (Double.class.isAssignableFrom(groupValue.getClass())) { + assertEquals(0.0d, groupValue); + } else if (Long.class.isAssignableFrom(groupValue.getClass())) { + assertEquals(0L, groupValue); + } else { + fail(); + } + } + +} Index: modules/grouping/src/test/org/apache/lucene/search/grouping/TermDistinctCountCollectorTest.java =================================================================== --- modules/grouping/src/test/org/apache/lucene/search/grouping/TermDistinctCountCollectorTest.java (revision ) +++ modules/grouping/src/test/org/apache/lucene/search/grouping/TermDistinctCountCollectorTest.java (revision ) @@ -0,0 +1,194 @@ +package org.apache.lucene.search.grouping; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.grouping.term.TermDistinctCountCollector; +import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; + +import java.util.ArrayList; +import java.util.List; + +public class TermDistinctCountCollectorTest extends LuceneTestCase { + + public void testBasic() throws Exception { + + final String groupField = "author"; + final String countField = "publisher"; + FieldType customType = new FieldType(); + customType.setStored(true); + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + random, + dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + // 0 + Document doc = new Document(); + doc.add(new Field(groupField, "author1", TextField.TYPE_STORED)); + doc.add(new Field("content", "random text", TextField.TYPE_STORED)); + doc.add(new Field(countField, "a", TextField.TYPE_STORED)); + doc.add(new Field("id", "1", customType)); + w.addDocument(doc); + + // 1 + doc = new Document(); + doc.add(new Field(groupField, "author1", TextField.TYPE_STORED)); + doc.add(new Field("content", "some more random text blob", TextField.TYPE_STORED)); + doc.add(new Field(countField, "a", TextField.TYPE_STORED)); + doc.add(new Field("id", "2", customType)); + w.addDocument(doc); + + // 2 + doc = new Document(); + doc.add(new Field(groupField, "author1", TextField.TYPE_STORED)); + doc.add(new Field("content", "some more random textual data", TextField.TYPE_STORED)); + doc.add(new Field(countField, "b", TextField.TYPE_STORED)); + doc.add(new Field("id", "3", customType)); + w.addDocument(doc); + w.commit(); // To ensure a second segment + + // 3 + doc = new Document(); + doc.add(new Field(groupField, "author2", TextField.TYPE_STORED)); + doc.add(new Field("content", "some random text", TextField.TYPE_STORED)); + doc.add(new Field("id", "4", customType)); + w.addDocument(doc); + + // 4 + doc = new Document(); + doc.add(new Field(groupField, "author3", TextField.TYPE_STORED)); + doc.add(new Field("content", "some more random text", TextField.TYPE_STORED)); + doc.add(new Field(countField, "a", TextField.TYPE_STORED)); + doc.add(new Field("id", "5", customType)); + w.addDocument(doc); + + // 5 + doc = new Document(); + doc.add(new Field(groupField, "author3", TextField.TYPE_STORED)); + doc.add(new Field("content", "random blob", TextField.TYPE_STORED)); + doc.add(new Field(countField, "a", TextField.TYPE_STORED)); + doc.add(new Field("id", "6", customType)); + w.addDocument(doc); + + // 6 -- no author field + doc = new Document(); + doc.add(new Field("content", "random word stuck in alot of other text", TextField.TYPE_STORED)); + doc.add(new Field(countField, "b", TextField.TYPE_STORED)); + doc.add(new Field("id", "6", customType)); + w.addDocument(doc); + + IndexSearcher indexSearcher = new IndexSearcher(w.getReader()); + w.close(); + + + // === Search for content:random + TermFirstPassGroupingCollector firstCollector = new TermFirstPassGroupingCollector(groupField, new Sort(), 10); + indexSearcher.search(new TermQuery(new Term("content", "random")), firstCollector); + AbstractDistinctCountCollector> distinctCountCollector + = new TermDistinctCountCollector(groupField, countField, firstCollector.getTopGroups(0, false)); + indexSearcher.search(new TermQuery(new Term("content", "random")), distinctCountCollector); + +// @SuppressWarnings("unchecked") + List> gcs = distinctCountCollector.getGroups(); + assertEquals(4, gcs.size()); + + assertEquals("author1", gcs.get(0).groupValue.utf8ToString()); + List countValues = new ArrayList(gcs.get(0).uniqueValues); + assertEquals(2, countValues.size()); + assertEquals("a", countValues.get(0).utf8ToString()); + assertEquals("b", countValues.get(1).utf8ToString()); + + assertEquals("author3", gcs.get(1).groupValue.utf8ToString()); + countValues = new ArrayList(gcs.get(1).uniqueValues); + assertEquals(1, countValues.size()); + assertEquals("a", countValues.get(0).utf8ToString()); + + assertEquals("author2", gcs.get(2).groupValue.utf8ToString()); + countValues = new ArrayList(gcs.get(2).uniqueValues); + assertEquals(1, countValues.size()); + assertNull(countValues.get(0)); + + assertNull(gcs.get(3).groupValue); + countValues = new ArrayList(gcs.get(3).uniqueValues); + assertEquals(1, countValues.size()); + assertEquals("b", countValues.get(0).utf8ToString()); + + // === Search for content:some + firstCollector = new TermFirstPassGroupingCollector(groupField, new Sort(), 10); + indexSearcher.search(new TermQuery(new Term("content", "some")), firstCollector); + distinctCountCollector = new TermDistinctCountCollector(groupField, countField, firstCollector.getTopGroups(0, false)); + indexSearcher.search(new TermQuery(new Term("content", "some")), distinctCountCollector); + + gcs = distinctCountCollector.getGroups(); + assertEquals(3, gcs.size()); + + assertEquals("author2", gcs.get(0).groupValue.utf8ToString()); + countValues = new ArrayList(gcs.get(0).uniqueValues); + assertEquals(1, countValues.size()); + assertNull(countValues.get(0)); + + assertEquals("author3", gcs.get(1).groupValue.utf8ToString()); + countValues = new ArrayList(gcs.get(1).uniqueValues); + assertEquals(1, countValues.size()); + assertEquals("a", countValues.get(0).utf8ToString()); + + assertEquals("author1", gcs.get(2).groupValue.utf8ToString()); + countValues = new ArrayList(gcs.get(2).uniqueValues); + assertEquals(2, countValues.size()); + assertEquals("a", countValues.get(0).utf8ToString()); + assertEquals("b", countValues.get(1).utf8ToString()); + + // === Search for content:blob + firstCollector = new TermFirstPassGroupingCollector(groupField, new Sort(), 10); + indexSearcher.search(new TermQuery(new Term("content", "blob")), firstCollector); + distinctCountCollector = new TermDistinctCountCollector(groupField, countField, firstCollector.getTopGroups(0, false)); + indexSearcher.search(new TermQuery(new Term("content", "blob")), distinctCountCollector); + + gcs = distinctCountCollector.getGroups(); + assertEquals(2, gcs.size()); + + assertEquals("author3", gcs.get(0).groupValue.utf8ToString()); + countValues = new ArrayList(gcs.get(0).uniqueValues); + assertEquals(1, countValues.size()); + assertEquals("a", countValues.get(0).utf8ToString()); + + assertEquals("author1", gcs.get(1).groupValue.utf8ToString()); + countValues = new ArrayList(gcs.get(1).uniqueValues); + // B/c the only one document matched with blob inside the author 1 group + assertEquals(1, countValues.size()); + assertEquals("a", countValues.get(0).utf8ToString()); + + + indexSearcher.getIndexReader().close(); + dir.close(); + } +} Index: modules/grouping/src/java/org/apache/lucene/search/grouping/term/TermDistinctCountCollector.java =================================================================== --- modules/grouping/src/java/org/apache/lucene/search/grouping/term/TermDistinctCountCollector.java (revision ) +++ modules/grouping/src/java/org/apache/lucene/search/grouping/term/TermDistinctCountCollector.java (revision ) @@ -0,0 +1,134 @@ +package org.apache.lucene.search.grouping.term; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.grouping.AbstractDistinctCountCollector; +import org.apache.lucene.search.grouping.SearchGroup; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.SentinelIntSet; + +import java.io.IOException; +import java.util.*; + +/** + * A term based implementation of {@link AbstractDistinctCountCollector} that relies on {@link FieldCache.DocTermsIndex} + * to count the distinct values per group. + * + * @lucene.experimental + */ +public class TermDistinctCountCollector extends AbstractDistinctCountCollector { + + private final String groupField; + private final String countField; + private final List groups; + private final SentinelIntSet ordSet; + private final GroupCount groupCounts[]; + private final BytesRef spare = new BytesRef(); + + private FieldCache.DocTermsIndex groupFieldTermIndex; + private FieldCache.DocTermsIndex countFieldTermIndex; + + /** + * Constructs {@link TermDistinctCountCollector} instance. + * + * @param groupField The field to group by + * @param countField The field to count distinct values for + * @param groups The top N groups, collected during the first phase search + */ + public TermDistinctCountCollector(String groupField, String countField, Collection> groups) { + this.groupField = groupField; + this.countField = countField; + this.groups = new ArrayList(groups.size()); + for (SearchGroup group : groups) { + this.groups.add(new GroupCount(group.groupValue)); + } + ordSet = new SentinelIntSet(groups.size(), -1); + groupCounts = new GroupCount[ordSet.keys.length]; + } + + public void collect(int doc) throws IOException { + int slot = ordSet.find(groupFieldTermIndex.getOrd(doc)); + if (slot < 0) { + return; + } + + GroupCount gc = groupCounts[slot]; + int countOrd = countFieldTermIndex.getOrd(doc); + if (doesNotContainsOrd(countOrd, gc.ords)) { + if (countOrd == 0) { + gc.uniqueValues.add(null); + } else { + gc.uniqueValues.add(countFieldTermIndex.lookup(countOrd, new BytesRef())); + } + + gc.ords = Arrays.copyOf(gc.ords, gc.ords.length + 1); + gc.ords[gc.ords.length - 1] = countOrd; + if (gc.ords.length > 1) { + Arrays.sort(gc.ords); + } + } + } + + private boolean doesNotContainsOrd(int ord, int[] ords) { + if (ords.length == 0) { + return true; + } else if (ords.length == 1) { + return ord != ords[0]; + } + return Arrays.binarySearch(ords, ord) < 0; + } + + public List getGroups() { + return groups; + } + + public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException { + groupFieldTermIndex = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField); + countFieldTermIndex = FieldCache.DEFAULT.getTermsIndex(context.reader, countField); + + ordSet.clear(); + for (GroupCount group : groups) { + int groupOrd = group.groupValue == null ? 0 : groupFieldTermIndex.binarySearchLookup(group.groupValue, spare); + if (groupOrd < 0) { + continue; + } + + groupCounts[ordSet.put(groupOrd)] = group; + group.ords = new int[group.uniqueValues.size()]; + int i = 0; + for (BytesRef value : group.uniqueValues) { + int countOrd = value == null ? 0 : countFieldTermIndex.binarySearchLookup(value, spare); + if (countOrd >= 0) { + group.ords[i++] = countOrd; + } + } + } + } + + static class GroupCount extends AbstractDistinctCountCollector.GroupCount { + + int[] ords; + + GroupCount(BytesRef groupValue) { + super(groupValue); + } + } + +} Index: modules/grouping/src/java/org/apache/lucene/search/grouping/AbstractDistinctCountCollector.java =================================================================== --- modules/grouping/src/java/org/apache/lucene/search/grouping/AbstractDistinctCountCollector.java (revision ) +++ modules/grouping/src/java/org/apache/lucene/search/grouping/AbstractDistinctCountCollector.java (revision ) @@ -0,0 +1,63 @@ +package org.apache.lucene.search.grouping; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Scorer; + +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +/** + * A second pass grouping collector that counts distinct values for a specified field + * per top N group. + * + * @lucene.experimental + */ +public abstract class AbstractDistinctCountCollector extends Collector { + + /** + * Returns the + * + * @return + */ + public abstract List getGroups(); + + public boolean acceptsDocsOutOfOrder() { + return true; + } + + public void setScorer(Scorer scorer) throws IOException { + } + + public abstract static class GroupCount { + + public final GROUP_VALUE_TYPE groupValue; + public final SortedSet uniqueValues; + + public GroupCount(GROUP_VALUE_TYPE groupValue) { + this.groupValue = groupValue; + uniqueValues = new TreeSet(); + } + + } + +} Index: modules/grouping/src/java/org/apache/lucene/search/grouping/dv/DVFirstPassGroupingCollector.java =================================================================== --- modules/grouping/src/java/org/apache/lucene/search/grouping/dv/DVFirstPassGroupingCollector.java (revision 1224605) +++ modules/grouping/src/java/org/apache/lucene/search/grouping/dv/DVFirstPassGroupingCollector.java (revision ) @@ -37,25 +37,30 @@ final boolean diskResident; final DocValues.Type valueType; - public static DVFirstPassGroupingCollector create(Sort groupSort, int topNGroups, String groupField, DocValues.Type type, boolean diskResident) throws IOException { + @SuppressWarnings("unchecked") + public static DVFirstPassGroupingCollector create(Sort groupSort, int topNGroups, String groupField, DocValues.Type type, boolean diskResident) throws IOException { switch (type) { case VAR_INTS: case FIXED_INTS_8: case FIXED_INTS_16: case FIXED_INTS_32: case FIXED_INTS_64: - return new Lng(groupSort, topNGroups, groupField, diskResident, type); + // Type erasure b/c otherwise we have inconvertible types... + return (DVFirstPassGroupingCollector) new Lng(groupSort, topNGroups, groupField, diskResident, type); case FLOAT_32: case FLOAT_64: - return new Dbl(groupSort, topNGroups, groupField, diskResident, type); + // Type erasure b/c otherwise we have inconvertible types... + return (DVFirstPassGroupingCollector) new Dbl(groupSort, topNGroups, groupField, diskResident, type); case BYTES_FIXED_STRAIGHT: case BYTES_FIXED_DEREF: case BYTES_VAR_STRAIGHT: case BYTES_VAR_DEREF: - return new BR(groupSort, topNGroups, groupField, diskResident, type); + // Type erasure b/c otherwise we have inconvertible types... + return (DVFirstPassGroupingCollector) new BR(groupSort, topNGroups, groupField, diskResident, type); case BYTES_VAR_SORTED: case BYTES_FIXED_SORTED: - return new SortedBR(groupSort, topNGroups, groupField, diskResident, type); + // Type erasure b/c otherwise we have inconvertible types... + return (DVFirstPassGroupingCollector) new SortedBR(groupSort, topNGroups, groupField, diskResident, type); default: throw new IllegalArgumentException(String.format("ValueType %s not supported", type)); } Index: modules/grouping/src/java/org/apache/lucene/search/grouping/dv/DVDistinctCountCollector.java =================================================================== --- modules/grouping/src/java/org/apache/lucene/search/grouping/dv/DVDistinctCountCollector.java (revision ) +++ modules/grouping/src/java/org/apache/lucene/search/grouping/dv/DVDistinctCountCollector.java (revision ) @@ -0,0 +1,296 @@ +package org.apache.lucene.search.grouping.dv; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.grouping.AbstractDistinctCountCollector; +import org.apache.lucene.search.grouping.SearchGroup; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.SentinelIntSet; +import org.apache.lucene.index.DocValues.Type; // javadocs + +import java.io.IOException; +import java.util.*; + +/** + * Docvalues implementation of {@link AbstractDistinctCountCollector}. + * + * @lucene.experimental + */ +public abstract class DVDistinctCountCollector extends AbstractDistinctCountCollector { + + final String groupField; + final String countField; + final boolean diskResident; + final Type valueType; + + DVDistinctCountCollector(String groupField, String countField, boolean diskResident, Type valueType) { + this.groupField = groupField; + this.countField = countField; + this.diskResident = diskResident; + this.valueType = valueType; + } + + /** + * Constructs a docvalues based implementation of {@link AbstractDistinctCountCollector} based on the specified + * type. + * + * @param groupField The field to group by + * @param countField The field to count distinct values for + * @param groups The top N groups, collected during the first phase search + * @param diskResident Whether the values to group and count by should be disk resident + * @param type The {@link Type} which is used to select a concrete implementation + * @return a docvalues based distinct count collector + */ + @SuppressWarnings("unchecked") + public static DVDistinctCountCollector> create(String groupField, String countField, Collection> groups, boolean diskResident, Type type) { + switch (type) { + case VAR_INTS: + case FIXED_INTS_8: + case FIXED_INTS_16: + case FIXED_INTS_32: + case FIXED_INTS_64: + // Type erasure b/c otherwise we have inconvertible types... + return (DVDistinctCountCollector) new NonSorted.Lng(groupField, countField, (Collection) groups, diskResident, type); + case FLOAT_32: + case FLOAT_64: + // Type erasure b/c otherwise we have inconvertible types... + return (DVDistinctCountCollector) new NonSorted.Dbl(groupField, countField, (Collection) groups, diskResident, type); + case BYTES_FIXED_STRAIGHT: + case BYTES_FIXED_DEREF: + case BYTES_VAR_STRAIGHT: + case BYTES_VAR_DEREF: + // Type erasure b/c otherwise we have inconvertible types... + return (DVDistinctCountCollector) new NonSorted.BR(groupField, countField, (Collection) groups, diskResident, type); + case BYTES_VAR_SORTED: + case BYTES_FIXED_SORTED: + // Type erasure b/c otherwise we have inconvertible types... + return (DVDistinctCountCollector) new Sorted.BR(groupField, countField, (Collection) groups, diskResident, type); + default: + throw new IllegalArgumentException(String.format("ValueType %s not supported", type)); + } + } + + + static abstract class NonSorted extends DVDistinctCountCollector { + + final Map groupMap = new HashMap(); + + DocValues.Source groupFieldSource; + DocValues.Source countFieldSource; + + NonSorted(String groupField, String countField, boolean diskResident, Type valueType) { + super(groupField, countField, diskResident, valueType); + } + + public List getGroups() { + return new ArrayList(groupMap.values()); + } + + public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException { + groupFieldSource = retrieveSource(groupField, context); + countFieldSource = retrieveSource(countField, context); + } + + private DocValues.Source retrieveSource(String fieldName, IndexReader.AtomicReaderContext context) throws IOException { + DocValues groupFieldDv = context.reader.docValues(fieldName); + if (groupFieldDv != null) { + return diskResident ? groupFieldDv.getDirectSource() : groupFieldDv.getSource(); + } else { + return DocValues.getDefaultSource(valueType); + } + } + + static class Dbl extends NonSorted { + + Dbl(String groupField, String countField, Collection> groups, boolean diskResident, Type valueType) { + super(groupField, countField, diskResident, valueType); + for (SearchGroup group : groups) { + groupMap.put(group.groupValue, new GroupCount(group.groupValue)); + } + } + + public void collect(int doc) throws IOException { + GroupCount groupCount = groupMap.get(groupFieldSource.getFloat(doc)); + if (groupCount != null) { + groupCount.uniqueValues.add(countFieldSource.getFloat(doc)); + } + } + + } + + static class Lng extends NonSorted { + + Lng(String groupField, String countField, Collection> groups, boolean diskResident, Type valueType) { + super(groupField, countField, diskResident, valueType); + for (SearchGroup group : groups) { + groupMap.put(group.groupValue, new GroupCount(group.groupValue)); + } + } + + public void collect(int doc) throws IOException { + GroupCount groupCount = groupMap.get(groupFieldSource.getInt(doc)); + if (groupCount != null) { + groupCount.uniqueValues.add(countFieldSource.getInt(doc)); + } + } + + } + + static class BR extends NonSorted { + + private final BytesRef spare = new BytesRef(); + + BR(String groupField, String countField, Collection> groups, boolean diskResident, Type valueType) { + super(groupField, countField, diskResident, valueType); + for (SearchGroup group : groups) { + groupMap.put(group.groupValue, new GroupCount(group.groupValue)); + } + } + + public void collect(int doc) throws IOException { + GroupCount groupCount = groupMap.get(groupFieldSource.getBytes(doc, spare)); + if (groupCount != null) { + BytesRef countValue = countFieldSource.getBytes(doc, spare); + if (!groupCount.uniqueValues.contains(countValue)) { + groupCount.uniqueValues.add(BytesRef.deepCopyOf(countValue)); + } + } + } + + } + + static class GroupCount extends AbstractDistinctCountCollector.GroupCount { + + GroupCount(Comparable groupValue) { + super(groupValue); + } + + } + + } + + + static abstract class Sorted extends DVDistinctCountCollector { + + final SentinelIntSet ordSet; + final GroupCount groupCounts[]; + final List groups = new ArrayList(); + + DocValues.SortedSource groupFieldSource; + DocValues.SortedSource countFieldSource; + + Sorted(String groupField, String countField, int groupSize, boolean diskResident, Type valueType) { + super(groupField, countField, diskResident, valueType); + ordSet = new SentinelIntSet(groupSize, -1); + groupCounts = new GroupCount[ordSet.keys.length]; + } + + public List getGroups() { + return groups; + } + + public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException { + groupFieldSource = retrieveSortedSource(groupField, context); + countFieldSource = retrieveSortedSource(countField, context); + ordSet.clear(); + } + + private DocValues.SortedSource retrieveSortedSource(String field, IndexReader.AtomicReaderContext context) throws IOException { + DocValues countFieldDv = context.reader.docValues(field); + if (countFieldDv != null) { + return diskResident ? countFieldDv.getDirectSource().asSortedSource() : countFieldDv.getSource().asSortedSource(); + } else { + return DocValues.getDefaultSortedSource(valueType, context.reader.maxDoc()); + } + } + + static class BR extends Sorted { + + final BytesRef spare = new BytesRef(); + + BR(String groupField, String countField, Collection> searchGroups, boolean diskResident, Type valueType) { + super(groupField, countField, searchGroups.size(), diskResident, valueType); + for (SearchGroup group : searchGroups) { + this.groups.add(new GroupCount(group.groupValue)); + } + } + + public void collect(int doc) throws IOException { + int slot = ordSet.find(groupFieldSource.ord(doc)); + if (slot < 0) { + return; + } + + GroupCount gc = groupCounts[slot]; + int countOrd = countFieldSource.ord(doc); + if (doesNotContainsOrd(countOrd, gc.ords)) { + gc.uniqueValues.add(countFieldSource.getByOrd(countOrd, new BytesRef())); + gc.ords = Arrays.copyOf(gc.ords, gc.ords.length + 1); + gc.ords[gc.ords.length - 1] = countOrd; + if (gc.ords.length > 1) { + Arrays.sort(gc.ords); + } + } + } + + private boolean doesNotContainsOrd(int ord, int[] ords) { + if (ords.length == 0) { + return true; + } else if (ords.length == 1) { + return ord != ords[0]; + } + return Arrays.binarySearch(ords, ord) < 0; + } + + @Override + public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException { + super.setNextReader(context); + for (GroupCount group : groups) { + int groupOrd = groupFieldSource.getByValue((BytesRef) group.groupValue, spare); + if (groupOrd < 0) { + continue; + } + + groupCounts[ordSet.put(groupOrd)] = group; + group.ords = new int[group.uniqueValues.size()]; + int i = 0; + for (Comparable value : group.uniqueValues) { + int countOrd = countFieldSource.getByValue((BytesRef) value, spare); + if (countOrd >= 0) { + group.ords[i++] = countOrd; + } + } + } + } + } + + static class GroupCount extends AbstractDistinctCountCollector.GroupCount { + + int[] ords; + + GroupCount(Comparable groupValue) { + super(groupValue); + } + + } + + } + +}