Index: modules/grouping/src/java/org/apache/lucene/search/grouping/AbstractDistinctValuesCollector.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- modules/grouping/src/java/org/apache/lucene/search/grouping/AbstractDistinctValuesCollector.java (revision ) +++ modules/grouping/src/java/org/apache/lucene/search/grouping/AbstractDistinctValuesCollector.java (revision ) @@ -0,0 +1,80 @@ +package org.apache.lucene.search.grouping; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Scorer; + +import java.io.IOException; +import java.util.Comparator; +import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; + +/** + * A second pass grouping collector that keeps track of distinct values for a specified field for the top N group. + * + * @lucene.experimental + */ +public abstract class AbstractDistinctValuesCollector extends Collector { + + /** + * Returns all unique values for each top N group. + * + * @return all unique values for each top N group + */ + public abstract List getGroups(); + + public boolean acceptsDocsOutOfOrder() { + return true; + } + + public void setScorer(Scorer scorer) throws IOException { + } + + public abstract static class GroupCount { + + public final GROUP_VALUE_TYPE groupValue; + public final SortedSet uniqueValues; + + public GroupCount(GROUP_VALUE_TYPE groupValue) { + this(groupValue, new TreeSet(new Comparator() { + + @SuppressWarnings({"unchecked","rawtypes"}) + public int compare(GROUP_VALUE_TYPE a, GROUP_VALUE_TYPE b) { + if (a == b) { + return 0; + } else if (a == null) { + return -1; + } else if (b == null) { + return 1; + } else { + return a.compareTo(b); + } + } + + })); + } + + public GroupCount(GROUP_VALUE_TYPE groupValue, SortedSet uniqueValues) { + this.groupValue = groupValue; + this.uniqueValues = uniqueValues; + } + } + +} Index: modules/grouping/src/test/org/apache/lucene/search/grouping/DistinctValuesCollectorTest.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- modules/grouping/src/test/org/apache/lucene/search/grouping/DistinctValuesCollectorTest.java (revision ) +++ modules/grouping/src/test/org/apache/lucene/search/grouping/DistinctValuesCollectorTest.java (revision ) @@ -0,0 +1,511 @@ +package org.apache.lucene.search.grouping; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.*; +import org.apache.lucene.index.*; +import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.grouping.dv.DVDistinctValuesCollector; +import org.apache.lucene.search.grouping.dv.DVFirstPassGroupingCollector; +import org.apache.lucene.search.grouping.function.FunctionDistinctValuesCollector; +import org.apache.lucene.search.grouping.function.FunctionFirstPassGroupingCollector; +import org.apache.lucene.search.grouping.term.TermDistinctValuesCollector; +import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.mutable.MutableValue; +import org.apache.lucene.util.mutable.MutableValueStr; + +import java.io.IOException; +import java.util.*; + +public class DistinctValuesCollectorTest extends AbstractGroupingTestCase { + + private final String groupField = "author"; + private final String countField = "publisher"; + + public void testSimple() throws Exception { + DocValues.Type[] dvTypes = new DocValues.Type[]{ + DocValues.Type.VAR_INTS, + DocValues.Type.FLOAT_64, + DocValues.Type.BYTES_VAR_STRAIGHT, + DocValues.Type.BYTES_VAR_SORTED + }; + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + random, + dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + boolean canUseDV = !"Lucene3x".equals(w.w.getConfig().getCodec().getName()); + DocValues.Type dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.length)] : null; + + Document doc = new Document(); + addField(doc, groupField, "1", dvType); + addField(doc, countField, "1", dvType); + doc.add(new Field("content", "random text", TextField.TYPE_UNSTORED)); + doc.add(new Field("id", "1", StringField.TYPE_UNSTORED)); + w.addDocument(doc); + + // 1 + doc = new Document(); + addField(doc, groupField, "1", dvType); + addField(doc, countField, "1", dvType); + doc.add(new Field("content", "some more random text blob", TextField.TYPE_UNSTORED)); + doc.add(new Field("id", "2", StringField.TYPE_UNSTORED)); + w.addDocument(doc); + + // 2 + doc = new Document(); + addField(doc, groupField, "1", dvType); + addField(doc, countField, "2", dvType); + doc.add(new Field("content", "some more random textual data", TextField.TYPE_UNSTORED)); + doc.add(new Field("id", "3", StringField.TYPE_UNSTORED)); + w.addDocument(doc); + w.commit(); // To ensure a second segment + + // 3 + doc = new Document(); + addField(doc, groupField, "2", dvType); + doc.add(new Field("content", "some random text", TextField.TYPE_UNSTORED)); + doc.add(new Field("id", "4", StringField.TYPE_UNSTORED)); + w.addDocument(doc); + + // 4 + doc = new Document(); + addField(doc, groupField, "3", dvType); + addField(doc, countField, "1", dvType); + doc.add(new Field("content", "some more random text", TextField.TYPE_UNSTORED)); + doc.add(new Field("id", "5", StringField.TYPE_UNSTORED)); + w.addDocument(doc); + + // 5 + doc = new Document(); + addField(doc, groupField, "3", dvType); + addField(doc, countField, "1", dvType); + doc.add(new Field("content", "random blob", TextField.TYPE_UNSTORED)); + doc.add(new Field("id", "6", StringField.TYPE_UNSTORED)); + w.addDocument(doc); + + // 6 -- no author field + doc = new Document(); + doc.add(new Field("content", "random word stuck in alot of other text", TextField.TYPE_STORED)); + addField(doc, countField, "1", dvType); + doc.add(new Field("id", "6", StringField.TYPE_UNSTORED)); + w.addDocument(doc); + + IndexSearcher indexSearcher = newSearcher(w.getReader()); + w.close(); + + Comparator>> cmp = new Comparator>>() { + + public int compare(AbstractDistinctValuesCollector.GroupCount> groupCount1, AbstractDistinctValuesCollector.GroupCount> groupCount2) { + if (groupCount1.groupValue == null) { + if (groupCount2.groupValue == null) { + return 0; + } + return -1; + } else if (groupCount2.groupValue == null) { + return 1; + } else { + return groupCount1.groupValue.compareTo(groupCount2.groupValue); + } + } + + }; + + // === Search for content:random + AbstractFirstPassGroupingCollector> firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10); + indexSearcher.search(new TermQuery(new Term("content", "random")), firstCollector); + AbstractDistinctValuesCollector>> distinctValuesCollector + = createDistinctCountCollector(firstCollector, groupField, countField, dvType); + indexSearcher.search(new TermQuery(new Term("content", "random")), distinctValuesCollector); + + List>> gcs = distinctValuesCollector.getGroups(); + Collections.sort(gcs, cmp); + assertEquals(4, gcs.size()); + + compareNull(gcs.get(0).groupValue); + List countValues = new ArrayList(gcs.get(0).uniqueValues); + assertEquals(1, countValues.size()); + compare("1", countValues.get(0)); + + compare("1", gcs.get(1).groupValue); + countValues = new ArrayList(gcs.get(1).uniqueValues); + assertEquals(2, countValues.size()); + compare("1", countValues.get(0)); + compare("2", countValues.get(1)); + + compare("2", gcs.get(2).groupValue); + countValues = new ArrayList(gcs.get(2).uniqueValues); + assertEquals(1, countValues.size()); + compareNull(countValues.get(0)); + + compare("3", gcs.get(3).groupValue); + countValues = new ArrayList(gcs.get(3).uniqueValues); + assertEquals(1, countValues.size()); + compare("1", countValues.get(0)); + + // === Search for content:some + firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10); + indexSearcher.search(new TermQuery(new Term("content", "some")), firstCollector); + distinctValuesCollector = createDistinctCountCollector(firstCollector, groupField, countField, dvType); + indexSearcher.search(new TermQuery(new Term("content", "some")), distinctValuesCollector); + + gcs = distinctValuesCollector.getGroups(); + Collections.sort(gcs, cmp); + assertEquals(3, gcs.size()); + + compare("1", gcs.get(0).groupValue); + countValues = new ArrayList(gcs.get(0).uniqueValues); + assertEquals(2, countValues.size()); + compare("1", countValues.get(0)); + compare("2", countValues.get(1)); + + compare("2", gcs.get(1).groupValue); + countValues = new ArrayList(gcs.get(1).uniqueValues); + assertEquals(1, countValues.size()); + compareNull(countValues.get(0)); + + compare("3", gcs.get(2).groupValue); + countValues = new ArrayList(gcs.get(2).uniqueValues); + assertEquals(1, countValues.size()); + compare("1", countValues.get(0)); + + // === Search for content:blob + firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10); + indexSearcher.search(new TermQuery(new Term("content", "blob")), firstCollector); + distinctValuesCollector = createDistinctCountCollector(firstCollector, groupField, countField, dvType); + indexSearcher.search(new TermQuery(new Term("content", "blob")), distinctValuesCollector); + + gcs = distinctValuesCollector.getGroups(); + Collections.sort(gcs, cmp); + assertEquals(2, gcs.size()); + + compare("1", gcs.get(0).groupValue); + countValues = new ArrayList(gcs.get(0).uniqueValues); + // B/c the only one document matched with blob inside the author 1 group + assertEquals(1, countValues.size()); + compare("1", countValues.get(0)); + + compare("3", gcs.get(1).groupValue); + countValues = new ArrayList(gcs.get(1).uniqueValues); + assertEquals(1, countValues.size()); + compare("1", countValues.get(0)); + + indexSearcher.getIndexReader().close(); + dir.close(); + } + + public void testRandom() throws Exception { + int numberOfRuns = _TestUtil.nextInt(random, 3, 6); + for (int indexIter = 0; indexIter < numberOfRuns; indexIter++) { + IndexContext context = createIndexContext(); + for (int searchIter = 0; searchIter < 100; searchIter++) { + final IndexSearcher searcher = newSearcher(context.indexReader); + boolean useDv = context.dvType != null && random.nextBoolean(); + DocValues.Type dvType = useDv ? context.dvType : null; + String term = context.contentStrings[random.nextInt(context.contentStrings.length)]; + Sort groupSort = new Sort(new SortField("id", SortField.Type.STRING));//Sort.INDEXORDER; + int topN = 10; + + List> expectedResult = createExpectedResult(context, term, groupSort, topN); + + AbstractFirstPassGroupingCollector> firstCollector = createRandomFirstPassCollector(dvType, groupSort, groupField, topN); + searcher.search(new TermQuery(new Term("content", term)), firstCollector); + AbstractDistinctValuesCollector>> distinctValuesCollector + = createDistinctCountCollector(firstCollector, groupField, countField, dvType); + searcher.search(new TermQuery(new Term("content", term)), distinctValuesCollector); + @SuppressWarnings("unchecked") + List>> actualResult = (List>>) distinctValuesCollector.getGroups(); + + if (VERBOSE) { + } + System.out.println("Collector class name=" + distinctValuesCollector.getClass().getName()); + + assertEquals(expectedResult.size(), actualResult.size()); + for (int i = 0; i < expectedResult.size(); i++) { + AbstractDistinctValuesCollector.GroupCount expected = expectedResult.get(i); + AbstractDistinctValuesCollector.GroupCount> actual = actualResult.get(i); + assertValues(expected.groupValue, actual.groupValue); + assertEquals(expected.uniqueValues.size(), actual.uniqueValues.size()); + List expectedUniqueValues = new ArrayList(expected.uniqueValues); + List actualUniqueValues = new ArrayList(actual.uniqueValues); + for (int j = 0; j < expected.uniqueValues.size(); j++) { + assertValues(expectedUniqueValues.get(j), actualUniqueValues.get(j)); + } + } + } + context.indexReader.close(); + context.directory.close(); + } + } + + private void assertValues(Object expected, Object actual) { + if (expected == null) { + compareNull(actual); + } else { + compare(((BytesRef) expected).utf8ToString(), actual); + } + } + + private void compare(String expected, Object groupValue) { + if (BytesRef.class.isAssignableFrom(groupValue.getClass())) { + assertEquals(expected, ((BytesRef) groupValue).utf8ToString()); + } else if (Double.class.isAssignableFrom(groupValue.getClass())) { + assertEquals(Double.parseDouble(expected), groupValue); + } else if (Long.class.isAssignableFrom(groupValue.getClass())) { + assertEquals(Long.parseLong(expected), groupValue); + } else if (MutableValue.class.isAssignableFrom(groupValue.getClass())) { + MutableValueStr mutableValue = new MutableValueStr(); + mutableValue.value = new BytesRef(expected); + assertEquals(mutableValue, groupValue); + } else { + fail(); + } + } + + private void compareNull(Object groupValue) { + if (groupValue == null) { + return; // term based impl... + } + // DV based impls.. + if (BytesRef.class.isAssignableFrom(groupValue.getClass())) { + assertEquals("", ((BytesRef) groupValue).utf8ToString()); + } else if (Double.class.isAssignableFrom(groupValue.getClass())) { + assertEquals(0.0d, groupValue); + } else if (Long.class.isAssignableFrom(groupValue.getClass())) { + assertEquals(0L, groupValue); + // Function based impl + } else if (MutableValue.class.isAssignableFrom(groupValue.getClass())) { + assertFalse(((MutableValue) groupValue).exists()); + } else { + fail(); + } + } + + private void addField(Document doc, String field, String value, DocValues.Type type) { + doc.add(new Field(field, value, StringField.TYPE_UNSTORED)); + if (type == null) { + return; + } + + DocValuesField valuesField = null; + switch (type) { + case VAR_INTS: + valuesField = new DocValuesField(field, Integer.parseInt(value), type); + break; + case FLOAT_64: + valuesField = new DocValuesField(field, Double.parseDouble(value), type); + break; + case BYTES_VAR_STRAIGHT: + case BYTES_VAR_SORTED: + valuesField = new DocValuesField(field, new BytesRef(value), type); + break; + } + doc.add(valuesField); + } + + @SuppressWarnings({"unchecked","rawtypes"}) + private AbstractDistinctValuesCollector> createDistinctCountCollector(AbstractFirstPassGroupingCollector firstPassGroupingCollector, + String groupField, + String countField, + DocValues.Type dvType) { + Collection> searchGroups = firstPassGroupingCollector.getTopGroups(0, false); + if (DVFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) { + boolean diskResident = random.nextBoolean(); + return DVDistinctValuesCollector.create(groupField, countField, searchGroups, diskResident, dvType); + } else if (FunctionFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) { + return (AbstractDistinctValuesCollector) new FunctionDistinctValuesCollector(new HashMap(), new BytesRefFieldSource(groupField), new BytesRefFieldSource(countField), (Collection) searchGroups); + } else { + return (AbstractDistinctValuesCollector) new TermDistinctValuesCollector(groupField, countField, (Collection) searchGroups); + } + } + + @SuppressWarnings({"unchecked","rawtypes"}) + private AbstractFirstPassGroupingCollector createRandomFirstPassCollector(DocValues.Type dvType, Sort groupSort, String groupField, int topNGroups) throws IOException { + if (dvType != null) { + if (random.nextBoolean()) { + boolean diskResident = random.nextBoolean(); + return DVFirstPassGroupingCollector.create(groupSort, topNGroups, groupField, dvType, diskResident); + } /*else if (random.nextBoolean()) { + return (AbstractFirstPassGroupingCollector) new FunctionFirstPassGroupingCollector(new BytesRefFieldSource(groupField), new HashMap(), groupSort, topNGroups); + }*/ else { + return (AbstractFirstPassGroupingCollector) new TermFirstPassGroupingCollector(groupField, groupSort, topNGroups); + } + } else { + return (AbstractFirstPassGroupingCollector) new TermFirstPassGroupingCollector(groupField, groupSort, topNGroups); +// if (random.nextBoolean()) { +// return (AbstractFirstPassGroupingCollector) new FunctionFirstPassGroupingCollector(new BytesRefFieldSource(groupField), new HashMap(), groupSort, topNGroups); +// } else { +// return (AbstractFirstPassGroupingCollector) new TermFirstPassGroupingCollector(groupField, groupSort, topNGroups); +// } + } + } + + @SuppressWarnings({"unchecked","rawtypes"}) + private List> createExpectedResult(IndexContext context, String term, Sort groupSort, int topN) { + class NullByteRefComparator implements Comparator { + + public int compare(BytesRef a, BytesRef b) { + if (a == b) { + return 0; + } else if (a == null) { + return -1; + } else if (b == null) { + return 1; + } else { + return a.compareTo(b); + } + } + + } + + class GroupCount extends AbstractDistinctValuesCollector.GroupCount { + GroupCount(BytesRef groupValue, Collection uniqueValues) { + super(groupValue, new TreeSet(new NullByteRefComparator())); + this.uniqueValues.addAll(uniqueValues); + } + } + + List result = new ArrayList(); + Map> groupCounts = context.searchTermToGroupCounts.get(term); + int i = 0; + for (String group : groupCounts.keySet()) { + if (topN <= i++) { + break; + } + TreeSet uniqueValues = new TreeSet(new NullByteRefComparator()); + for (String val : groupCounts.get(group)) { + uniqueValues.add(val != null ? new BytesRef(val) : null); + } + result.add(new GroupCount(group != null ? new BytesRef(group) : null, uniqueValues)); + } + return result; + } + + private IndexContext createIndexContext() throws Exception { + DocValues.Type[] dvTypes = new DocValues.Type[]{ + DocValues.Type.BYTES_VAR_STRAIGHT, + DocValues.Type.BYTES_VAR_SORTED + }; + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + random, + dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + boolean canUseDV = !"Lucene3x".equals(w.w.getConfig().getCodec().getName()); + DocValues.Type dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.length)] : null; + + int numDocs = 86 + random.nextInt(1087) * RANDOM_MULTIPLIER; + String[] groupValues = new String[numDocs / 5]; + String[] countValues = new String[numDocs / 10]; + for (int i = 0; i < groupValues.length; i++) { + groupValues[i] = generateRandomNonEmptyString(); + } + for (int i = 0; i < countValues.length; i++) { + countValues[i] = generateRandomNonEmptyString(); + } + + List contentStrings = new ArrayList(); + Map>> searchTermToGroupCounts = new HashMap>>(); + for (int i = 1; i <= numDocs; i++) { + String groupValue = random.nextInt(23) == 14 ? null : groupValues[random.nextInt(groupValues.length)]; + String countValue = random.nextInt(21) == 13 ? null : countValues[random.nextInt(countValues.length)]; + String content = "random" + random.nextInt(numDocs / 20); + Map> groupToCounts = searchTermToGroupCounts.get(content); + if (groupToCounts == null) { + // Groups sort always DOCID asc... + searchTermToGroupCounts.put(content, groupToCounts = new LinkedHashMap>()); + contentStrings.add(content); + } + + SortedSet countsVals = groupToCounts.get(groupValue); + if (countsVals == null) { + groupToCounts.put(groupValue, countsVals = new TreeSet(new Comparator() { + + public int compare(String a, String b) { + if (a == b) { + return 0; + } else if (a == null) { + return -1; + } else if (b == null) { + return 1; + } else { + return a.compareTo(b); + } + } + + })); + } + countsVals.add(countValue); + + Document doc = new Document(); + doc.add(new Field("id", String.format("%05d", i), StringField.TYPE_UNSTORED)); + if (groupValue != null) { + addField(doc, groupField, groupValue, dvType); + } + if (countValue != null) { + addField(doc, countField, countValue, dvType); + } + doc.add(new Field("content", content, TextField.TYPE_UNSTORED)); + w.addDocument(doc); + } + + DirectoryReader reader = w.getReader(); + w.close(); + return new IndexContext(dir, reader, dvType, searchTermToGroupCounts, contentStrings.toArray(new String[contentStrings.size()])); + } + + class IndexContext { + + final Directory directory; + final DirectoryReader indexReader; + final DocValues.Type dvType; + final Map>> searchTermToGroupCounts; + final String[] contentStrings; + + IndexContext(Directory directory, DirectoryReader indexReader, DocValues.Type dvType, + Map>> searchTermToGroupCounts, String[] contentStrings) { + this.directory = directory; + this.indexReader = indexReader; + this.dvType = dvType; + this.searchTermToGroupCounts = searchTermToGroupCounts; + this.contentStrings = contentStrings; + } + } + + /*class GroupedDoc implements Comparable { + + private final String id; + private final String group; + + public int compareTo(GroupedDoc other) { + throw new UnsupportedOperationException(); + } + }*/ + +} Index: modules/grouping/src/java/org/apache/lucene/search/grouping/term/TermDistinctValuesCollector.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- modules/grouping/src/java/org/apache/lucene/search/grouping/term/TermDistinctValuesCollector.java (revision ) +++ modules/grouping/src/java/org/apache/lucene/search/grouping/term/TermDistinctValuesCollector.java (revision ) @@ -0,0 +1,134 @@ +package org.apache.lucene.search.grouping.term; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.grouping.AbstractDistinctValuesCollector; +import org.apache.lucene.search.grouping.SearchGroup; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.SentinelIntSet; + +import java.io.IOException; +import java.util.*; + +/** + * A term based implementation of {@link org.apache.lucene.search.grouping.AbstractDistinctValuesCollector} that relies on {@link FieldCache.DocTermsIndex} + * to count the distinct values per group. + * + * @lucene.experimental + */ +public class TermDistinctValuesCollector extends AbstractDistinctValuesCollector { + + private final String groupField; + private final String countField; + private final List groups; + private final SentinelIntSet ordSet; + private final GroupCount groupCounts[]; + private final BytesRef spare = new BytesRef(); + + private FieldCache.DocTermsIndex groupFieldTermIndex; + private FieldCache.DocTermsIndex countFieldTermIndex; + + /** + * Constructs {@link TermDistinctValuesCollector} instance. + * + * @param groupField The field to group by + * @param countField The field to count distinct values for + * @param groups The top N groups, collected during the first phase search + */ + public TermDistinctValuesCollector(String groupField, String countField, Collection> groups) { + this.groupField = groupField; + this.countField = countField; + this.groups = new ArrayList(groups.size()); + for (SearchGroup group : groups) { + this.groups.add(new GroupCount(group.groupValue)); + } + ordSet = new SentinelIntSet(groups.size(), -1); + groupCounts = new GroupCount[ordSet.keys.length]; + } + + public void collect(int doc) throws IOException { + int slot = ordSet.find(groupFieldTermIndex.getOrd(doc)); + if (slot < 0) { + return; + } + + GroupCount gc = groupCounts[slot]; + int countOrd = countFieldTermIndex.getOrd(doc); + if (doesNotContainsOrd(countOrd, gc.ords)) { + if (countOrd == 0) { + gc.uniqueValues.add(null); + } else { + gc.uniqueValues.add(countFieldTermIndex.lookup(countOrd, new BytesRef())); + } + + gc.ords = Arrays.copyOf(gc.ords, gc.ords.length + 1); + gc.ords[gc.ords.length - 1] = countOrd; + if (gc.ords.length > 1) { + Arrays.sort(gc.ords); + } + } + } + + private boolean doesNotContainsOrd(int ord, int[] ords) { + if (ords.length == 0) { + return true; + } else if (ords.length == 1) { + return ord != ords[0]; + } + return Arrays.binarySearch(ords, ord) < 0; + } + + public List getGroups() { + return groups; + } + + public void setNextReader(AtomicReaderContext context) throws IOException { + groupFieldTermIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), groupField); + countFieldTermIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), countField); + + ordSet.clear(); + for (GroupCount group : groups) { + int groupOrd = group.groupValue == null ? 0 : groupFieldTermIndex.binarySearchLookup(group.groupValue, spare); + if (groupOrd < 0) { + continue; + } + + groupCounts[ordSet.put(groupOrd)] = group; + group.ords = new int[group.uniqueValues.size()]; + int i = 0; + for (BytesRef value : group.uniqueValues) { + int countOrd = value == null ? 0 : countFieldTermIndex.binarySearchLookup(value, spare); + if (countOrd >= 0) { + group.ords[i++] = countOrd; + } + } + } + } + + static class GroupCount extends AbstractDistinctValuesCollector.GroupCount { + + int[] ords; + + GroupCount(BytesRef groupValue) { + super(groupValue); + } + } + +} Index: solr/core/src/test/GroupCountCollectorRunner.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- solr/core/src/test/GroupCountCollectorRunner.java (revision ) +++ solr/core/src/test/GroupCountCollectorRunner.java (revision ) @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.*; +import org.apache.lucene.search.grouping.AbstractDistinctValuesCollector; +import org.apache.lucene.search.grouping.term.TermDistinctValuesCollector; +import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector; +import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.BytesRef; + +import java.io.File; + +/** + * @author Martijn van Groningen + */ +public class GroupCountCollectorRunner { + + public static void main(String[] args) throws Exception { + Sort groupSort = new Sort(new SortField("accoid", SortField.Type.STRING_VAL)); + Sort sortWithinGroup = new Sort(new SortField("price", SortField.Type.LONG)); + + IndexReader indexReader = IndexReader.open(FSDirectory.open(new File("/home/mvg/temp/travel/data/index"))); + IndexSearcher searcher = new IndexSearcher(indexReader); + TermFirstPassGroupingCollector first = + new TermFirstPassGroupingCollector("accoid", groupSort, 10); + searcher.search(new MatchAllDocsQuery(), first); + + TermDistinctValuesCollector countCollector = new TermDistinctValuesCollector("accoid", "touroperator", first.getTopGroups(0, false)); + searcher.search(new MatchAllDocsQuery(), countCollector); + + TermSecondPassGroupingCollector second = + new TermSecondPassGroupingCollector("accoid", first.getTopGroups(0, false), groupSort, sortWithinGroup, 1, false, false, false); + searcher.search(new MatchAllDocsQuery(), second); + + long startTime = System.currentTimeMillis(); + second = new TermSecondPassGroupingCollector("accoid", first.getTopGroups(0, false), groupSort, sortWithinGroup, 1, false, false, false); + searcher.search(new MatchAllDocsQuery(), second); + System.out.println("Second took: " + (System.currentTimeMillis() - startTime)); + + startTime = System.currentTimeMillis(); + countCollector = new TermDistinctValuesCollector("accoid", "touroperator", first.getTopGroups(0, false)); + searcher.search(new MatchAllDocsQuery(), countCollector); + System.out.println("Count took: " + (System.currentTimeMillis() - startTime)); + + for (AbstractDistinctValuesCollector.GroupCount groupCount : countCollector.getGroups()) { + System.out.println(groupCount.groupValue.utf8ToString() + " : " + groupCount.uniqueValues.size()); + } + } + +} Index: modules/grouping/src/java/org/apache/lucene/search/grouping/dv/DVDistinctValuesCollector.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- modules/grouping/src/java/org/apache/lucene/search/grouping/dv/DVDistinctValuesCollector.java (revision ) +++ modules/grouping/src/java/org/apache/lucene/search/grouping/dv/DVDistinctValuesCollector.java (revision ) @@ -0,0 +1,296 @@ +package org.apache.lucene.search.grouping.dv; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.search.grouping.AbstractDistinctValuesCollector; +import org.apache.lucene.search.grouping.SearchGroup; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.SentinelIntSet; +import org.apache.lucene.index.DocValues.Type; // javadocs + +import java.io.IOException; +import java.util.*; + +/** + * Docvalues implementation of {@link org.apache.lucene.search.grouping.AbstractDistinctValuesCollector}. + * + * @lucene.experimental + */ +public abstract class DVDistinctValuesCollector extends AbstractDistinctValuesCollector { + + final String groupField; + final String countField; + final boolean diskResident; + final Type valueType; + + DVDistinctValuesCollector(String groupField, String countField, boolean diskResident, Type valueType) { + this.groupField = groupField; + this.countField = countField; + this.diskResident = diskResident; + this.valueType = valueType; + } + + /** + * Constructs a docvalues based implementation of {@link org.apache.lucene.search.grouping.AbstractDistinctValuesCollector} based on the specified + * type. + * + * @param groupField The field to group by + * @param countField The field to count distinct values for + * @param groups The top N groups, collected during the first phase search + * @param diskResident Whether the values to group and count by should be disk resident + * @param type The {@link Type} which is used to select a concrete implementation + * @return a docvalues based distinct count collector + */ + @SuppressWarnings("unchecked") + public static DVDistinctValuesCollector> create(String groupField, String countField, Collection> groups, boolean diskResident, Type type) { + switch (type) { + case VAR_INTS: + case FIXED_INTS_8: + case FIXED_INTS_16: + case FIXED_INTS_32: + case FIXED_INTS_64: + // Type erasure b/c otherwise we have inconvertible types... + return (DVDistinctValuesCollector) new NonSorted.Lng(groupField, countField, (Collection) groups, diskResident, type); + case FLOAT_32: + case FLOAT_64: + // Type erasure b/c otherwise we have inconvertible types... + return (DVDistinctValuesCollector) new NonSorted.Dbl(groupField, countField, (Collection) groups, diskResident, type); + case BYTES_FIXED_STRAIGHT: + case BYTES_FIXED_DEREF: + case BYTES_VAR_STRAIGHT: + case BYTES_VAR_DEREF: + // Type erasure b/c otherwise we have inconvertible types... + return (DVDistinctValuesCollector) new NonSorted.BR(groupField, countField, (Collection) groups, diskResident, type); + case BYTES_VAR_SORTED: + case BYTES_FIXED_SORTED: + // Type erasure b/c otherwise we have inconvertible types... + return (DVDistinctValuesCollector) new Sorted.BR(groupField, countField, (Collection) groups, diskResident, type); + default: + throw new IllegalArgumentException(String.format("ValueType %s not supported", type)); + } + } + + + static abstract class NonSorted extends DVDistinctValuesCollector { + + final Map groupMap = new LinkedHashMap(); + + DocValues.Source groupFieldSource; + DocValues.Source countFieldSource; + + NonSorted(String groupField, String countField, boolean diskResident, Type valueType) { + super(groupField, countField, diskResident, valueType); + } + + public List getGroups() { + return new ArrayList(groupMap.values()); + } + + public void setNextReader(AtomicReaderContext context) throws IOException { + groupFieldSource = retrieveSource(groupField, context); + countFieldSource = retrieveSource(countField, context); + } + + private DocValues.Source retrieveSource(String fieldName, AtomicReaderContext context) throws IOException { + DocValues groupFieldDv = context.reader().docValues(fieldName); + if (groupFieldDv != null) { + return diskResident ? groupFieldDv.getDirectSource() : groupFieldDv.getSource(); + } else { + return DocValues.getDefaultSource(valueType); + } + } + + static class Dbl extends NonSorted { + + Dbl(String groupField, String countField, Collection> groups, boolean diskResident, Type valueType) { + super(groupField, countField, diskResident, valueType); + for (SearchGroup group : groups) { + groupMap.put(group.groupValue, new GroupCount(group.groupValue)); + } + } + + public void collect(int doc) throws IOException { + GroupCount groupCount = groupMap.get(groupFieldSource.getFloat(doc)); + if (groupCount != null) { + groupCount.uniqueValues.add(countFieldSource.getFloat(doc)); + } + } + + } + + static class Lng extends NonSorted { + + Lng(String groupField, String countField, Collection> groups, boolean diskResident, Type valueType) { + super(groupField, countField, diskResident, valueType); + for (SearchGroup group : groups) { + groupMap.put(group.groupValue, new GroupCount(group.groupValue)); + } + } + + public void collect(int doc) throws IOException { + GroupCount groupCount = groupMap.get(groupFieldSource.getInt(doc)); + if (groupCount != null) { + groupCount.uniqueValues.add(countFieldSource.getInt(doc)); + } + } + + } + + static class BR extends NonSorted { + + private final BytesRef spare = new BytesRef(); + + BR(String groupField, String countField, Collection> groups, boolean diskResident, Type valueType) { + super(groupField, countField, diskResident, valueType); + for (SearchGroup group : groups) { + groupMap.put(group.groupValue, new GroupCount(group.groupValue)); + } + } + + public void collect(int doc) throws IOException { + GroupCount groupCount = groupMap.get(groupFieldSource.getBytes(doc, spare)); + if (groupCount != null) { + BytesRef countValue = countFieldSource.getBytes(doc, spare); + if (!groupCount.uniqueValues.contains(countValue)) { + groupCount.uniqueValues.add(BytesRef.deepCopyOf(countValue)); + } + } + } + + } + + static class GroupCount extends AbstractDistinctValuesCollector.GroupCount { + + GroupCount(Comparable groupValue) { + super(groupValue); + } + + } + + } + + + static abstract class Sorted extends DVDistinctValuesCollector { + + final SentinelIntSet ordSet; + final GroupCount groupCounts[]; + final List groups = new ArrayList(); + + DocValues.SortedSource groupFieldSource; + DocValues.SortedSource countFieldSource; + + Sorted(String groupField, String countField, int groupSize, boolean diskResident, Type valueType) { + super(groupField, countField, diskResident, valueType); + ordSet = new SentinelIntSet(groupSize, -1); + groupCounts = new GroupCount[ordSet.keys.length]; + } + + public List getGroups() { + return groups; + } + + public void setNextReader(AtomicReaderContext context) throws IOException { + groupFieldSource = retrieveSortedSource(groupField, context); + countFieldSource = retrieveSortedSource(countField, context); + ordSet.clear(); + } + + private DocValues.SortedSource retrieveSortedSource(String field, AtomicReaderContext context) throws IOException { + DocValues countFieldDv = context.reader().docValues(field); + if (countFieldDv != null) { + return diskResident ? countFieldDv.getDirectSource().asSortedSource() : countFieldDv.getSource().asSortedSource(); + } else { + return DocValues.getDefaultSortedSource(valueType, context.reader().maxDoc()); + } + } + + static class BR extends Sorted { + + final BytesRef spare = new BytesRef(); + + BR(String groupField, String countField, Collection> searchGroups, boolean diskResident, Type valueType) { + super(groupField, countField, searchGroups.size(), diskResident, valueType); + for (SearchGroup group : searchGroups) { + this.groups.add(new GroupCount(group.groupValue)); + } + } + + public void collect(int doc) throws IOException { + int slot = ordSet.find(groupFieldSource.ord(doc)); + if (slot < 0) { + return; + } + + GroupCount gc = groupCounts[slot]; + int countOrd = countFieldSource.ord(doc); + if (doesNotContainsOrd(countOrd, gc.ords)) { + gc.uniqueValues.add(countFieldSource.getByOrd(countOrd, new BytesRef())); + gc.ords = Arrays.copyOf(gc.ords, gc.ords.length + 1); + gc.ords[gc.ords.length - 1] = countOrd; + if (gc.ords.length > 1) { + Arrays.sort(gc.ords); + } + } + } + + private boolean doesNotContainsOrd(int ord, int[] ords) { + if (ords.length == 0) { + return true; + } else if (ords.length == 1) { + return ord != ords[0]; + } + return Arrays.binarySearch(ords, ord) < 0; + } + + @Override + public void setNextReader(AtomicReaderContext context) throws IOException { + super.setNextReader(context); + for (GroupCount group : groups) { + int groupOrd = groupFieldSource.getOrdByValue((BytesRef) group.groupValue, spare); + if (groupOrd < 0) { + continue; + } + + groupCounts[ordSet.put(groupOrd)] = group; + group.ords = new int[group.uniqueValues.size()]; + int i = 0; + for (Comparable value : group.uniqueValues) { + int countOrd = countFieldSource.getOrdByValue((BytesRef) value, spare); + if (countOrd >= 0) { + group.ords[i++] = countOrd; + } + } + } + } + } + + static class GroupCount extends AbstractDistinctValuesCollector.GroupCount { + + int[] ords; + + GroupCount(Comparable groupValue) { + super(groupValue); + } + + } + + } + +} Index: modules/grouping/src/java/org/apache/lucene/search/grouping/function/FunctionDistinctValuesCollector.java IDEA additional info: Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP <+>UTF-8 =================================================================== --- modules/grouping/src/java/org/apache/lucene/search/grouping/function/FunctionDistinctValuesCollector.java (revision ) +++ modules/grouping/src/java/org/apache/lucene/search/grouping/function/FunctionDistinctValuesCollector.java (revision ) @@ -0,0 +1,86 @@ +package org.apache.lucene.search.grouping.function; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.queries.function.FunctionValues; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.lucene.search.grouping.AbstractDistinctValuesCollector; +import org.apache.lucene.search.grouping.SearchGroup; +import org.apache.lucene.util.mutable.MutableValue; + +import java.io.IOException; +import java.util.*; + +/** + * Function based implementation of {@link org.apache.lucene.search.grouping.AbstractDistinctValuesCollector}. + * + * @lucene.experimental + */ +public class FunctionDistinctValuesCollector extends AbstractDistinctValuesCollector { + + private final Map vsContext; + private final ValueSource groupSource; + private final ValueSource countSource; + private final Map groupMap; + + private FunctionValues.ValueFiller groupFiller; + private FunctionValues.ValueFiller countFiller; + private MutableValue groupMval; + private MutableValue countMval; + + public FunctionDistinctValuesCollector(Map vsContext, ValueSource groupSource, ValueSource countSource, Collection> groups) { + this.vsContext = vsContext; + this.groupSource = groupSource; + this.countSource = countSource; + groupMap = new HashMap(); + for (SearchGroup group : groups) { + groupMap.put(group.groupValue, new GroupCount(group.groupValue)); + } + } + + public List getGroups() { + return new ArrayList(groupMap.values()); + } + + public void collect(int doc) throws IOException { + groupFiller.fillValue(doc); + GroupCount groupCount = groupMap.get(groupMval); + if (groupCount != null) { + countFiller.fillValue(doc); + groupCount.uniqueValues.add(countMval.duplicate()); + } + } + + public void setNextReader(AtomicReaderContext context) throws IOException { + FunctionValues values = groupSource.getValues(vsContext, context); + groupFiller = values.getValueFiller(); + groupMval = groupFiller.getValue(); + values = countSource.getValues(vsContext, context); + countFiller = values.getValueFiller(); + countMval = countFiller.getValue(); + } + + static class GroupCount extends AbstractDistinctValuesCollector.GroupCount { + + GroupCount(MutableValue groupValue) { + super(groupValue); + } + + } +}