Index: lucene/src/java/org/apache/lucene/util/BytesRef.java
--- lucene/src/java/org/apache/lucene/util/BytesRef.java Wed May 11 18:19:24 2011 -0400
+++ lucene/src/java/org/apache/lucene/util/BytesRef.java Wed May 11 18:35:35 2011 -0400
@@ -19,9 +19,6 @@
import java.util.Comparator;
import java.io.UnsupportedEncodingException;
-import java.io.ObjectInput;
-import java.io.ObjectOutput;
-import java.io.IOException;
/** Represents byte[], as a slice (offset + length) into an
* existing byte[].
@@ -192,6 +189,9 @@
@Override
public boolean equals(Object other) {
+ if (other == null) {
+ return false;
+ }
return this.bytesEquals((BytesRef) other);
}
Index: modules/grouping/build.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/grouping/build.xml Wed May 11 18:35:35 2011 -0400
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<project name="grouping" default="default">
+
+  <description>
+    Collectors for grouping search results
+  </description>
+
+  <property name="build.dir" location="build/" />
+  <property name="dist.dir" location="dist/" />
+  <property name="maven.dist.dir" location="../dist/maven" />
+
+  <import file="../../lucene/contrib/contrib-build.xml"/>
+</project>
Index: modules/grouping/src/java/org/apache/lucene/search/grouping/CachingCollector.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/grouping/src/java/org/apache/lucene/search/grouping/CachingCollector.java Wed May 11 18:35:35 2011 -0400
@@ -0,0 +1,252 @@
+package org.apache.lucene.search.grouping;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * Caches all docs, and optionally also scores, coming from
+ * a search, and is then able to replay them to another
+ * collector. You specify the max RAM this class may use.
+ * Once the collection is done, call {@link #isCached}. If
+ * this returns true, you can use {@link #replay} against a
+ * new collector. If it returns false, this means too much
+ * RAM was required and you must instead re-run the original
+ * search.
+ *
+ *
+ * <p><b>NOTE</b>: this class consumes 4 (or 8, if
+ * scoring is cached) bytes per collected document. If the result
+ * set is large this can easily be a very substantial amount
+ * of RAM!
+ *
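+ * <p>A hypothetical usage sketch ({@code searcher}, {@code query}
+ * and the two collectors are placeholders, not part of this patch):
+ *
+ * <pre>
+ *   CachingCollector cache = new CachingCollector(firstCollector, true, 8.0);
+ *   searcher.search(query, cache);
+ *   if (cache.isCached()) {
+ *     cache.replay(secondCollector);   // replay cached docs (and scores)
+ *   } else {
+ *     searcher.search(query, secondCollector);   // cache overflowed; re-run
+ *   }
+ * </pre>
+ *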
+ * @lucene.experimental
+ */
+public class CachingCollector extends Collector {
+
+ private static class SegStart {
+ public final AtomicReaderContext readerContext;
+ public final int end;
+
+ public SegStart(AtomicReaderContext readerContext, int end) {
+ this.readerContext = readerContext;
+ this.end = end;
+ }
+ }
+
+ // TODO: would be nice if a collector defined a
+ // needsScores() method so we can specialize / do checks
+ // up front:
+ private final Collector other;
+ private final int maxDocsToCache;
+
+ private final Scorer cachedScorer;
+  private final List<int[]> cachedDocs;
+  private final List<float[]> cachedScores;
+  private final List<SegStart> cachedSegs = new ArrayList<SegStart>();
+
+ private Scorer scorer;
+ private int[] curDocs;
+ private float[] curScores;
+ private int upto;
+ private AtomicReaderContext lastReaderContext;
+ private float score;
+ private int base;
+
+ public CachingCollector(Collector other, boolean cacheScores, double maxRAMMB) {
+ this.other = other;
+ if (cacheScores) {
+ cachedScorer = new Scorer(null) {
+ @Override
+ public float score() {
+ return score;
+ }
+
+ @Override
+ public int advance(int target) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int docID() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public float freq() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int nextDoc() {
+ throw new UnsupportedOperationException();
+ }
+ };
+      cachedScores = new ArrayList<float[]>();
+ curScores = new float[128];
+ cachedScores.add(curScores);
+ } else {
+ cachedScorer = null;
+ cachedScores = null;
+ }
+    cachedDocs = new ArrayList<int[]>();
+ curDocs = new int[128];
+ cachedDocs.add(curDocs);
+
+ final int bytesPerDoc;
+ if (curScores != null) {
+ bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT + RamUsageEstimator.NUM_BYTES_FLOAT;
+ } else {
+ bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT;
+ }
+ maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024)/bytesPerDoc);
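+    // e.g., with scores cached (8 bytes/doc), maxRAMMB=4.0 allows
+    // (4 * 1024 * 1024) / 8 = 524,288 docs before the cache is dropped.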
+ }
+
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {
+ this.scorer = scorer;
+ other.setScorer(cachedScorer);
+ }
+
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return other.acceptsDocsOutOfOrder();
+ }
+
+ @Override
+ public void collect(int doc) throws IOException {
+
+ if (curDocs == null) {
+ // Cache was too large
+ if (curScores != null) {
+ score = scorer.score();
+ }
+ other.collect(doc);
+ return;
+ }
+
+ if (upto == curDocs.length) {
+ base += upto;
+ final int nextLength;
+ // Max out at 512K arrays:
+ if (curDocs.length < 524288) {
+ nextLength = 8*curDocs.length;
+ } else {
+ nextLength = curDocs.length;
+ }
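+      // Arrays grow 128 -> 1K -> 8K -> 64K -> 512K entries, then stay at 512K.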
+
+ if (base + nextLength > maxDocsToCache) {
+ // Too many docs to collect -- clear cache
+ curDocs = null;
+ if (curScores != null) {
+ score = scorer.score();
+ }
+ other.collect(doc);
+ cachedDocs.clear();
+ cachedScores.clear();
+ return;
+ }
+ curDocs = new int[nextLength];
+ cachedDocs.add(curDocs);
+ if (curScores != null) {
+ curScores = new float[nextLength];
+ cachedScores.add(curScores);
+ }
+ upto = 0;
+ }
+ curDocs[upto] = doc;
+ // TODO: maybe specialize private subclass so we don't
+ // null check per collect...
+ if (curScores != null) {
+ score = curScores[upto] = scorer.score();
+ }
+ upto++;
+ other.collect(doc);
+ }
+
+ public boolean isCached() {
+ return curDocs != null;
+ }
+
+ @Override
+ public void setNextReader(AtomicReaderContext context) throws IOException {
+ other.setNextReader(context);
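+    // Record where the previous segment's cached docs ended, so
+    // replay() can switch readers at the correct doc boundary.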
+ if (lastReaderContext != null) {
+ cachedSegs.add(new SegStart(lastReaderContext, base+upto));
+ }
+ lastReaderContext = context;
+ }
+
+ private final static int[] EMPTY_INT_ARRAY = new int[0];
+
+ @Override
+ public String toString() {
+ if (isCached()) {
+ return "CachingCollector (" + (base+upto) + " docs " + (curScores != null ? " & scores" : "") + " cached)";
+ } else {
+ return "CachingCollector (cache was cleared)";
+ }
+ }
+
+ public void replay(Collector other) throws IOException {
+ if (!isCached()) {
+ throw new IllegalStateException("cannot replay: cache was cleared because too much RAM was required");
+ }
+ //System.out.println("CC: replay totHits=" + (upto + base));
+ if (lastReaderContext != null) {
+ cachedSegs.add(new SegStart(lastReaderContext, base+upto));
+ lastReaderContext = null;
+ }
+ final int uptoSav = upto;
+ final int baseSav = base;
+ try {
+ upto = 0;
+ base = 0;
+ int chunkUpto = 0;
+ other.setScorer(cachedScorer);
+ curDocs = EMPTY_INT_ARRAY;
+ for(SegStart seg : cachedSegs) {
+ other.setNextReader(seg.readerContext);
+ while(base+upto < seg.end) {
+ if (upto == curDocs.length) {
+ base += curDocs.length;
+ curDocs = cachedDocs.get(chunkUpto);
+ if (curScores != null) {
+ curScores = cachedScores.get(chunkUpto);
+ }
+ chunkUpto++;
+ upto = 0;
+ }
+ if (curScores != null) {
+ score = curScores[upto];
+ }
+ other.collect(curDocs[upto++]);
+ }
+ }
+ } finally {
+ upto = uptoSav;
+ base = baseSav;
+ }
+ }
+}
Index: modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java Wed May 11 18:35:35 2011 -0400
@@ -0,0 +1,362 @@
+package org.apache.lucene.search.grouping;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.TreeSet;
+
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.FieldComparator;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.util.BytesRef;
+
+/** FirstPassGroupingCollector is the first of two passes necessary
+ * to collect grouped hits.  This pass gathers the top N sorted
+ * groups.
+ *
+ * @lucene.experimental
+ */
+
+public class FirstPassGroupingCollector extends Collector {
+
+ private final String groupField;
+ private final Sort groupSort;
+ private final FieldComparator[] comparators;
+ private final int[] reversed;
+ private final int topNGroups;
+  private final HashMap<BytesRef, CollectedSearchGroup> groupMap;
+ private final BytesRef scratchBytesRef = new BytesRef();
+ private final int compIDXEnd;
+
+ // Set once we reach topNGroups unique groups:
+  private TreeSet<CollectedSearchGroup> orderedGroups;
+ private int docBase;
+ private int spareSlot;
+ private FieldCache.DocTermsIndex index;
+
+ /**
+ * Create the first pass collector.
+ *
+ * @param groupField The field used to group
+ * documents. This field must be single-valued and
+ * indexed (FieldCache is used to access its value
+ * per-document).
+ * @param groupSort The {@link Sort} used to sort the
+ * groups. The top sorted document within each group
+   *    according to groupSort determines how that group
+   *    sorts against other groups.  This must be non-null,
+   *    i.e., if you want to sort groups by relevance use
+   *    Sort.RELEVANCE.
+ * @param topNGroups How many top groups to keep.
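+   *
+   * <p>A hypothetical call (collecting the top 10 groups by
+   * relevance on an <code>author</code> field):
+   *
+   * <pre>
+   *   FirstPassGroupingCollector c1 =
+   *     new FirstPassGroupingCollector("author", Sort.RELEVANCE, 10);
+   * </pre>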
+ */
+ public FirstPassGroupingCollector(String groupField, Sort groupSort, int topNGroups) throws IOException {
+ if (topNGroups < 1) {
+ throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
+ }
+
+ this.groupField = groupField;
+ // TODO: allow null groupSort to mean "by relevance",
+ // and specialize it?
+ this.groupSort = groupSort;
+
+ this.topNGroups = topNGroups;
+
+ final SortField[] sortFields = groupSort.getSort();
+ comparators = new FieldComparator[sortFields.length];
+ compIDXEnd = comparators.length - 1;
+ reversed = new int[sortFields.length];
+ for (int i = 0; i < sortFields.length; i++) {
+ final SortField sortField = sortFields[i];
+
+ // use topNGroups + 1 so we have a spare slot to use for comparing (tracked by this.spareSlot):
+ comparators[i] = sortField.getComparator(topNGroups + 1, i);
+ reversed[i] = sortField.getReverse() ? -1 : 1;
+ }
+
+ spareSlot = topNGroups;
+    groupMap = new HashMap<BytesRef, CollectedSearchGroup>(topNGroups);
+ }
+
+ /** Returns top groups, starting from offset. This may
+ * return null, if no groups were collected, or if the
+ * number of unique groups collected is <= offset. */
+  public Collection<SearchGroup> getTopGroups(int groupOffset, boolean fillFields) {
+
+ //System.out.println("FP.getTopGroups groupOffset=" + groupOffset + " fillFields=" + fillFields + " groupMap.size()=" + groupMap.size());
+
+ if (groupOffset < 0) {
+ throw new IllegalArgumentException("groupOffset must be >= 0 (got " + groupOffset + ")");
+ }
+
+ if (groupMap.size() <= groupOffset) {
+ return null;
+ }
+
+ if (orderedGroups == null) {
+ buildSortedSet();
+ }
+
+    final Collection<SearchGroup> result = new ArrayList<SearchGroup>();
+ int upto = 0;
+ final int sortFieldCount = groupSort.getSort().length;
+ for(CollectedSearchGroup group : orderedGroups) {
+ if (upto++ < groupOffset) {
+ continue;
+ }
+ //System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString()));
+ SearchGroup searchGroup = new SearchGroup();
+ searchGroup.groupValue = group.groupValue;
+ if (fillFields) {
+ searchGroup.sortValues = new Comparable[sortFieldCount];
+        for(int sortFieldIDX=0;sortFieldIDX<sortFieldCount;sortFieldIDX++) {
+          searchGroup.sortValues[sortFieldIDX] = comparators[sortFieldIDX].value(group.comparatorSlot);
+        }
+      }
+      result.add(searchGroup);
+    }
+    return result;
+  }
+
+  @Override
+  public void setScorer(Scorer scorer) throws IOException {
+    for (FieldComparator comparator : comparators) {
+      comparator.setScorer(scorer);
+    }
+  }
+
+  @Override
+  public void collect(int doc) throws IOException {
+
+    // If orderedGroups != null we already have collected N groups and
+    // can short circuit by comparing this document to the bottom group,
+    // without having to find what group this document belongs to:
+    if (orderedGroups != null) {
+      for (int compIDX = 0;; compIDX++) {
+        final int c = reversed[compIDX] * comparators[compIDX].compareBottom(doc);
+        if (c < 0) {
+          // Definitely not competitive:
+          return;
+        } else if (c > 0) {
+          // Definitely competitive:
+          break;
+        } else if (compIDX == compIDXEnd) {
+          // Here c=0; docs are visited in docID order, so this doc
+          // cannot compete with any group already in the queue:
+          return;
+        }
+      }
+    }
+
+    final int ord = index.getOrd(doc);
+    final BytesRef groupValue = ord == 0 ? null : index.lookup(ord, scratchBytesRef);
+
+    final CollectedSearchGroup group = groupMap.get(groupValue);
+
+    if (group == null) {
+      // First time we see this group, or it was pruned and is
+      // now coming back:
+      if (groupMap.size() < topNGroups) {
+        // Still in startup transient: just keep collecting groups:
+        final CollectedSearchGroup sg = new CollectedSearchGroup();
+        sg.groupValue = groupValue == null ? null : new BytesRef(groupValue);
+        sg.comparatorSlot = groupMap.size();
+        sg.topDoc = docBase + doc;
+        for (FieldComparator fc : comparators) {
+          fc.copy(sg.comparatorSlot, doc);
+        }
+        groupMap.put(sg.groupValue, sg);
+        if (groupMap.size() == topNGroups) {
+          // We now have the max number of groups; from here on we
+          // drop the bottom group when we insert a new one:
+          buildSortedSet();
+        }
+        return;
+      }
+
+      // This doc is competitive (checked above): replace the
+      // current bottom group with this new group:
+      final CollectedSearchGroup bottomGroup = orderedGroups.pollLast();
+      groupMap.remove(bottomGroup.groupValue);
+      bottomGroup.groupValue = groupValue == null ? null : new BytesRef(groupValue);
+      bottomGroup.topDoc = docBase + doc;
+      for (FieldComparator fc : comparators) {
+        fc.copy(bottomGroup.comparatorSlot, doc);
+      }
+      groupMap.put(bottomGroup.groupValue, bottomGroup);
+      orderedGroups.add(bottomGroup);
+      for (FieldComparator fc : comparators) {
+        fc.setBottom(orderedGroups.last().comparatorSlot);
+      }
+      return;
+    }
+
+    // Update existing group, if this doc sorts ahead of the
+    // group's current top doc:
+    for (int compIDX = 0;; compIDX++) {
+      final FieldComparator fc = comparators[compIDX];
+      fc.copy(spareSlot, doc);
+      final int c = reversed[compIDX] * fc.compare(group.comparatorSlot, spareSlot);
+      if (c < 0) {
+        // Definitely not competitive:
+        return;
+      } else if (c > 0) {
+        // Definitely competitive; set remaining comparators:
+        for (int compIDX2=compIDX+1; compIDX2<comparators.length; compIDX2++) {
+          comparators[compIDX2].copy(spareSlot, doc);
+        }
+        break;
+      } else if (compIDX == compIDXEnd) {
+        return;
+      }
+    }
+
+    // Swap the group's comparator slot with the spare slot and
+    // re-insert into the ordered set so its position is updated:
+    if (orderedGroups != null) {
+      orderedGroups.remove(group);
+    }
+    group.topDoc = docBase + doc;
+    final int tmp = spareSlot;
+    spareSlot = group.comparatorSlot;
+    group.comparatorSlot = tmp;
+    if (orderedGroups != null) {
+      orderedGroups.add(group);
+      for (FieldComparator fc : comparators) {
+        fc.setBottom(orderedGroups.last().comparatorSlot);
+      }
+    }
+  }
+
+  private void buildSortedSet() {
+    final Comparator<CollectedSearchGroup> comparator = new Comparator<CollectedSearchGroup>() {
+ public int compare(CollectedSearchGroup o1, CollectedSearchGroup o2) {
+ for (int compIDX = 0;; compIDX++) {
+ FieldComparator fc = comparators[compIDX];
+ final int c = reversed[compIDX] * fc.compare(o1.comparatorSlot, o2.comparatorSlot);
+ if (c != 0) {
+ return c;
+ } else if (compIDX == compIDXEnd) {
+ return o1.topDoc - o2.topDoc;
+ }
+ }
+ }
+ };
+
+    orderedGroups = new TreeSet<CollectedSearchGroup>(comparator);
+ orderedGroups.addAll(groupMap.values());
+ assert orderedGroups.size() > 0;
+
+ for (FieldComparator fc : comparators) {
+ fc.setBottom(orderedGroups.last().comparatorSlot);
+ }
+ }
+
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return false;
+ }
+
+ @Override
+ public void setNextReader(AtomicReaderContext readerContext) throws IOException {
+ docBase = readerContext.docBase;
+ index = FieldCache.DEFAULT.getTermsIndex(readerContext.reader, groupField);
+
+    for (int i=0; i<comparators.length; i++) {
+      comparators[i] = comparators[i].setNextReader(readerContext);
+    }
+  }
+}
+
+class CollectedSearchGroup extends SearchGroup {
+  int topDoc;
+  int comparatorSlot;
+}
Index: modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java Wed May 11 18:35:35 2011 -0400
+package org.apache.lucene.search.grouping;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.TopDocsCollector;
+import org.apache.lucene.search.TopFieldCollector;
+import org.apache.lucene.search.TopScoreDocCollector;
+import org.apache.lucene.util.BytesRef;
+
+/** SecondPassGroupingCollector is the second of two passes
+ *  necessary to collect grouped docs.  This pass gathers the
+ *  top N documents per group found in the first pass.
+ *
+ * @lucene.experimental
+ */
+public class SecondPassGroupingCollector extends Collector {
+
+  private final Map<BytesRef, SearchGroupDocs> groupMap;
+
+ private FieldCache.DocTermsIndex index;
+ private final String groupField;
+ private final int maxDocsPerGroup;
+ private final SentinelIntSet ordSet;
+ private final SearchGroupDocs[] groupDocs;
+ private final BytesRef spareBytesRef = new BytesRef();
+  private final Collection<SearchGroup> groups;
+ private final Sort withinGroupSort;
+ private final Sort groupSort;
+
+ private int totalHitCount;
+ private int totalGroupedHitCount;
+
+  public SecondPassGroupingCollector(String groupField, Collection<SearchGroup> groups, Sort groupSort, Sort withinGroupSort,
+ int maxDocsPerGroup, boolean getScores, boolean getMaxScores, boolean fillSortFields)
+ throws IOException {
+
+ //System.out.println("SP init");
+ if (groups.size() == 0) {
+ throw new IllegalArgumentException("no groups to collect (groups.size() is 0)");
+ }
+
+ this.groupSort = groupSort;
+ this.withinGroupSort = withinGroupSort;
+ this.groups = groups;
+ this.groupField = groupField;
+ this.maxDocsPerGroup = maxDocsPerGroup;
+
+    groupMap = new HashMap<BytesRef, SearchGroupDocs>(groups.size());
+
+ for (SearchGroup group : groups) {
+ //System.out.println(" prep group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString()));
+ final TopDocsCollector collector;
+ if (withinGroupSort == null) {
+ // Sort by score
+ collector = TopScoreDocCollector.create(maxDocsPerGroup, true);
+ } else {
+ // Sort by fields
+ collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, fillSortFields, getScores, getMaxScores, true);
+ }
+ groupMap.put(group.groupValue,
+ new SearchGroupDocs(group.groupValue,
+ collector));
+ }
+
+ ordSet = new SentinelIntSet(groupMap.size(), -1);
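+    // -1 is safe as the sentinel: ords from the terms index are
+    // always >= 0 (ord 0 is reserved for the null group value).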
+ groupDocs = new SearchGroupDocs[ordSet.keys.length];
+ }
+
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {
+ for (SearchGroupDocs group : groupMap.values()) {
+ group.collector.setScorer(scorer);
+ }
+ }
+
+ @Override
+ public void collect(int doc) throws IOException {
+ final int slot = ordSet.find(index.getOrd(doc));
+ //System.out.println("SP.collect doc=" + doc + " slot=" + slot);
+ totalHitCount++;
+ if (slot >= 0) {
+ totalGroupedHitCount++;
+ groupDocs[slot].collector.collect(doc);
+ }
+ }
+
+ @Override
+ public void setNextReader(AtomicReaderContext readerContext) throws IOException {
+ //System.out.println("SP.setNextReader");
+ for (SearchGroupDocs group : groupMap.values()) {
+ group.collector.setNextReader(readerContext);
+ }
+ index = FieldCache.DEFAULT.getTermsIndex(readerContext.reader, groupField);
+
+ // Rebuild ordSet
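+    // (ords are segment-relative, so the group -> ord mapping must be
+    // recomputed for every reader; a group whose value does not occur
+    // in this segment simply gets no slot)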
+ ordSet.clear();
+ for (SearchGroupDocs group : groupMap.values()) {
+ //System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.utf8ToString()));
+ int ord = group.groupValue == null ? 0 : index.binarySearchLookup(group.groupValue, spareBytesRef);
+ if (ord >= 0) {
+ groupDocs[ordSet.put(ord)] = group;
+ }
+ }
+ }
+
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return false;
+ }
+
+ public TopGroups getTopGroups(int withinGroupOffset) {
+ final GroupDocs[] groupDocsResult = new GroupDocs[groups.size()];
+
+ int groupIDX = 0;
+ for(SearchGroup group : groups) {
+ final SearchGroupDocs groupDocs = groupMap.get(group.groupValue);
+ final TopDocs topDocs = groupDocs.collector.topDocs(withinGroupOffset, maxDocsPerGroup);
+ groupDocsResult[groupIDX++] = new GroupDocs(topDocs.getMaxScore(),
+ topDocs.totalHits,
+ topDocs.scoreDocs,
+ groupDocs.groupValue,
+ group.sortValues);
+ }
+
+ return new TopGroups(groupSort.getSort(),
+ withinGroupSort == null ? null : withinGroupSort.getSort(),
+ totalHitCount, totalGroupedHitCount, groupDocsResult);
+ }
+}
+
+
+// TODO: merge with SearchGroup or not?
+// advantage: we don't need to build a new hashmap
+// disadvantage: blows up the size of SearchGroup if we need many of them, and couples implementations
+class SearchGroupDocs {
+ public final BytesRef groupValue;
+ public final TopDocsCollector collector;
+
+ public SearchGroupDocs(BytesRef groupValue, TopDocsCollector collector) {
+ this.groupValue = groupValue;
+ this.collector = collector;
+ }
+}
Index: modules/grouping/src/java/org/apache/lucene/search/grouping/SentinelIntSet.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/grouping/src/java/org/apache/lucene/search/grouping/SentinelIntSet.java Wed May 11 18:35:35 2011 -0400
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.grouping;
+
+import java.util.Arrays;
+
+/** A native int set where one value is reserved to mean "EMPTY" */
+public class SentinelIntSet {
+ public int[] keys;
+ public int count;
+ public final int emptyVal;
+ public int rehashCount; // the count at which a rehash should be done
+
+ public SentinelIntSet(int size, int emptyVal) {
+ this.emptyVal = emptyVal;
+ int tsize = Math.max(org.apache.lucene.util.BitUtil.nextHighestPowerOfTwo(size), 1);
+ rehashCount = tsize - (tsize>>2);
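+    // rehash when the set reaches 75% load (tsize - tsize/4 entries)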
+ if (size >= rehashCount) { // should be able to hold "size" w/o rehashing
+ tsize <<= 1;
+ rehashCount = tsize - (tsize>>2);
+ }
+ keys = new int[tsize];
+ if (emptyVal != 0)
+ clear();
+ }
+
+ public void clear() {
+ Arrays.fill(keys, emptyVal);
+ count = 0;
+ }
+
+ public int hash(int key) {
+ return key;
+ }
+
+ public int size() { return count; }
+
+ /** returns the slot for this key */
+ public int getSlot(int key) {
+ assert key != emptyVal;
+ int h = hash(key);
+ int s = h & (keys.length-1);
+ if (keys[s] == key || keys[s]== emptyVal) return s;
+
+ int increment = (h>>7)|1;
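+    // make the probe step odd so it is co-prime with the power-of-two
+    // table size and therefore visits every slot before repeating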
+ do {
+ s = (s + increment) & (keys.length-1);
+ } while (keys[s] != key && keys[s] != emptyVal);
+ return s;
+ }
+
+ /** returns the slot for this key, or -slot-1 if not found */
+ public int find(int key) {
+ assert key != emptyVal;
+ int h = hash(key);
+ int s = h & (keys.length-1);
+ if (keys[s] == key) return s;
+ if (keys[s] == emptyVal) return -s-1;
+
+ int increment = (h>>7)|1;
+ for(;;) {
+ s = (s + increment) & (keys.length-1);
+ if (keys[s] == key) return s;
+ if (keys[s] == emptyVal) return -s-1;
+ }
+ }
+
+
+ public boolean exists(int key) {
+ return find(key) >= 0;
+ }
+
+
+ public int put(int key) {
+ int s = find(key);
+ if (s < 0) {
+ if (count >= rehashCount) {
+ rehash();
+ s = getSlot(key);
+ } else {
+ s = -s-1;
+ }
+ count++;
+ keys[s] = key;
+ putKey(key, s);
+ } else {
+ overwriteKey(key, s);
+ }
+ return s;
+ }
+
+
+ protected void putKey(int key, int slot) {}
+ protected void overwriteKey(int key, int slot) {}
+
+ protected void startRehash(int newSize) {}
+ protected void moveKey(int key, int oldSlot, int newSlot) {}
+ protected void endRehash() {}
+
+ public void rehash() {
+ int newSize = keys.length << 1;
+ startRehash(newSize);
+ int[] oldKeys = keys;
+ keys = new int[newSize];
+ if (emptyVal != 0) Arrays.fill(keys, emptyVal);
+
+    for (int i=0; i<oldKeys.length; i++) {
+      int key = oldKeys[i];
+      if (key == emptyVal) continue;
+      int newSlot = getSlot(key);
+      keys[newSlot] = key;
+      moveKey(key, i, newSlot);
+    }
+    endRehash();
+    rehashCount = newSize - (newSize>>2);
+
+ }
+
+}
Index: modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java Wed May 11 18:35:35 2011 -0400
@@ -0,0 +1,48 @@
+package org.apache.lucene.search.grouping;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.SortField;
+
+/** Represents result returned by a grouping search.
+ *
+ * @lucene.experimental */
+public class TopGroups {
+ /** Number of documents matching the search */
+ public final int totalHitCount;
+
+ /** Number of documents grouped into the topN groups */
+ public final int totalGroupedHitCount;
+
+ /** Group results in groupSort order */
+ public final GroupDocs[] groups;
+
+ /** How groups are sorted against each other */
+ public final SortField[] groupSort;
+
+ /** How docs are sorted within each group */
+ public final SortField[] withinGroupSort;
+
+ public TopGroups(SortField[] groupSort, SortField[] withinGroupSort, int totalHitCount, int totalGroupedHitCount, GroupDocs[] groups) {
+ this.groupSort = groupSort;
+ this.withinGroupSort = withinGroupSort;
+ this.totalHitCount = totalHitCount;
+ this.totalGroupedHitCount = totalGroupedHitCount;
+ this.groups = groups;
+ }
+}
Index: modules/grouping/src/java/overview.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/grouping/src/java/overview.html Wed May 11 18:35:35 2011 -0400
@@ -0,0 +1,105 @@
+<html>
+<body>
+
+This module enables search result grouping with Lucene, where hits
+with the same value in the specified single-valued group field are
+grouped together. For example, if you group by the author
+field, then all documents with the same value in the author
+field fall into a single group.
+
+Grouping requires a number of inputs:
+
+<ul>
+  <li><b>groupField</b>: this is the field used for grouping.
+    For example, if you use the <code>author</code> field then each
+    group has all books by the same author.  Documents that don't
+    have this field are grouped under a single group with
+    a null group value.</li>
+
+  <li><b>groupSort</b>: how the groups are sorted.  For sorting
+    purposes, each group is "represented" by the highest-sorted
+    document according to the groupSort within it.  For
+    example, if you specify "price" (ascending) then the first group
+    is the one with the lowest price book within it.  Or if you
+    specify relevance group sort, then the first group is the one
+    containing the highest scoring book.</li>
+
+  <li><b>topNGroups</b>: how many top groups to keep.  For
+    example, 10 means the top 10 groups are computed.</li>
+
+  <li><b>groupOffset</b>: which "slice" of top groups you want to
+    retrieve.  For example, 3 means you'll get 7 groups back
+    (assuming topNGroups is 10).  This is useful for
+    paging, where you might show 5 groups per page.</li>
+
+  <li><b>withinGroupSort</b>: how the documents within each group
+    are sorted.  This can be different from the group sort.</li>
+
+  <li><b>maxDocsPerGroup</b>: how many top documents within each
+    group to keep.</li>
+
+  <li><b>withinGroupOffset</b>: which "slice" of top
+    documents you want to retrieve from each group.</li>
+</ul>
+
+The implementation is two-pass: the first pass ({@link
+ org.apache.lucene.search.grouping.FirstPassGroupingCollector})
+ gathers the top groups, and the second pass ({@link
+ org.apache.lucene.search.grouping.SecondPassGroupingCollector})
+ gathers documents within those groups. If the search is costly to
+ run you may want to use the {@link
+ org.apache.lucene.search.grouping.CachingCollector} class, which
+ caches hits and can (quickly) replay them for the second pass. This
+ way you only run the query once, but you pay a RAM cost to (briefly)
+ hold all hits. Results are returned as a {@link
+ org.apache.lucene.search.grouping.TopGroups} instance.
+
+Known limitations:
+
+<ul>
+  <li>The group field must be a single-valued indexed field.
+    {@link org.apache.lucene.search.FieldCache} is used to load the
+    {@link org.apache.lucene.search.FieldCache.DocTermsIndex} for this field.</li>
+
+  <li>Unlike Solr's implementation, this module cannot group by
+    function query values nor by arbitrary queries.</li>
+
+  <li>Sharding is not directly supported, though it is not too
+    difficult if you can merge the top groups and top documents per
+    group yourself.</li>
+</ul>
+
+Typical usage looks like this (using the {@link org.apache.lucene.search.grouping.CachingCollector}):
+
+<pre class="prettyprint">
+ FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);
+
+ boolean cacheScores = true;
+ double maxCacheRAMMB = 4.0;
+ CachingCollector cachedCollector = new CachingCollector(c1, cacheScores, maxCacheRAMMB);
+ s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector);
+
+  boolean fillFields = true;
+  Collection&lt;SearchGroup&gt; topGroups = c1.getTopGroups(groupOffset, fillFields);
+
+ if (topGroups == null) {
+ // No groups matched
+ return;
+ }
+
+  boolean getScores = true;
+  boolean getMaxScores = true;
+ SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
+
+ if (cachedCollector.isCached()) {
+ // Cache fit within maxCacheRAMMB, so we can replay it:
+ cachedCollector.replay(c2);
+ } else {
+ // Cache was too large; must re-execute query:
+ s.search(new TermQuery(new Term("content", searchTerm)), c2);
+ }
+
+ TopGroups groupsResult = c2.getTopGroups(docOffset);
+
+ // Render groupsResult...
+
+</pre>
+
+</body>
+</html>
Index: modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java Wed May 11 18:35:35 2011 -0400
@@ -0,0 +1,539 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.grouping;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.FieldDoc;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+// TODO
+// - should test relevance sort too
+// - test null
+// - test ties
+// - test compound sort
+
+public class TestGrouping extends LuceneTestCase {
+
+ public void testBasic() throws Exception {
+
+ final String groupField = "author";
+
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(
+ random,
+ dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT,
+ new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
+ // 0
+ Document doc = new Document();
+ doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "random text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 1
+ doc = new Document();
+ doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 2
+ doc = new Document();
+ doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some more random textual data", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 3
+ doc = new Document();
+ doc.add(new Field(groupField, "author2", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some random text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 4
+ doc = new Document();
+ doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 5
+ doc = new Document();
+ doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "random", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 6 -- no author field
+ doc = new Document();
+ doc.add(new Field("content", "random word stuck in alot of other text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
+ w.close();
+
+ final Sort groupSort = Sort.RELEVANCE;
+ final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector(groupField, groupSort, 10);
+ indexSearcher.search(new TermQuery(new Term("content", "random")), c1);
+
+ final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true);
+ indexSearcher.search(new TermQuery(new Term("content", "random")), c2);
+
+ final TopGroups groups = c2.getTopGroups(0);
+
+ assertEquals(7, groups.totalHitCount);
+ assertEquals(7, groups.totalGroupedHitCount);
+ assertEquals(4, groups.groups.length);
+
+ // relevance order: 5, 0, 3, 4, 1, 2, 6
+
+    // The later a document is added, the higher its docID value.
+ GroupDocs group = groups.groups[0];
+ assertEquals(new BytesRef("author3"), group.groupValue);
+ assertEquals(2, group.scoreDocs.length);
+ assertEquals(5, group.scoreDocs[0].doc);
+ assertEquals(4, group.scoreDocs[1].doc);
+ assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score);
+
+ group = groups.groups[1];
+ assertEquals(new BytesRef("author1"), group.groupValue);
+ assertEquals(3, group.scoreDocs.length);
+ assertEquals(0, group.scoreDocs[0].doc);
+ assertEquals(1, group.scoreDocs[1].doc);
+ assertEquals(2, group.scoreDocs[2].doc);
+ assertTrue(group.scoreDocs[0].score > group.scoreDocs[1].score);
+ assertTrue(group.scoreDocs[1].score > group.scoreDocs[2].score);
+
+ group = groups.groups[2];
+ assertEquals(new BytesRef("author2"), group.groupValue);
+ assertEquals(1, group.scoreDocs.length);
+ assertEquals(3, group.scoreDocs[0].doc);
+
+ group = groups.groups[3];
+ assertNull(group.groupValue);
+ assertEquals(1, group.scoreDocs.length);
+ assertEquals(6, group.scoreDocs[0].doc);
+
+ indexSearcher.getIndexReader().close();
+ dir.close();
+ }
+
+ private static class GroupDoc {
+ final int id;
+ final BytesRef group;
+ final BytesRef sort1;
+ final BytesRef sort2;
+ final String content;
+
+ public GroupDoc(int id, BytesRef group, BytesRef sort1, BytesRef sort2, String content) {
+ this.id = id;
+ this.group = group;
+ this.sort1 = sort1;
+ this.sort2 = sort2;
+ this.content = content;
+ }
+ }
+
+ private Sort getRandomSort() {
+    final List<SortField> sortFields = new ArrayList<SortField>();
+ if (random.nextBoolean()) {
+ if (random.nextBoolean()) {
+ sortFields.add(new SortField("sort1", SortField.STRING, random.nextBoolean()));
+ } else {
+ sortFields.add(new SortField("sort2", SortField.STRING, random.nextBoolean()));
+ }
+ } else if (random.nextBoolean()) {
+ sortFields.add(new SortField("sort1", SortField.STRING, random.nextBoolean()));
+ sortFields.add(new SortField("sort2", SortField.STRING, random.nextBoolean()));
+ }
+ sortFields.add(new SortField("id", SortField.INT));
+ return new Sort(sortFields.toArray(new SortField[sortFields.size()]));
+ }
+
+  private Comparator<GroupDoc> getComparator(Sort sort) {
+ final SortField[] sortFields = sort.getSort();
+    return new Comparator<GroupDoc>() {
+ public int compare(GroupDoc d1, GroupDoc d2) {
+ for(SortField sf : sortFields) {
+ final int cmp;
+ if (sf.getField().equals("sort1")) {
+ cmp = d1.sort1.compareTo(d2.sort1);
+ } else if (sf.getField().equals("sort2")) {
+ cmp = d1.sort2.compareTo(d2.sort2);
+ } else {
+ assertEquals(sf.getField(), "id");
+ cmp = d1.id - d2.id;
+ }
+ if (cmp != 0) {
+ return sf.getReverse() ? -cmp : cmp;
+ }
+ }
+ // Our sort always fully tie breaks:
+ fail();
+ return 0;
+ }
+ };
+ }
+
+ private Comparable[] fillFields(GroupDoc d, Sort sort) {
+ final SortField[] sortFields = sort.getSort();
+ final Comparable[] fields = new Comparable[sortFields.length];
+    for(int fieldIDX=0;fieldIDX<sortFields.length;fieldIDX++) {
+      final SortField sf = sortFields[fieldIDX];
+      if (sf.getField().equals("sort1")) {
+        fields[fieldIDX] = d.sort1.utf8ToString();
+      } else if (sf.getField().equals("sort2")) {
+        fields[fieldIDX] = d.sort2.utf8ToString();
+      } else {
+        assertEquals("id", sf.getField());
+        fields[fieldIDX] = d.id;
+      }
+    }
+    return fields;
+  }
+
+  private TopGroups slowGrouping(GroupDoc[] groupDocs,
+                                 String searchTerm,
+                                 boolean fillFields,
+                                 boolean getScores,
+                                 boolean getMaxScores,
+                                 Sort groupSort,
+                                 Sort docSort,
+                                 int topNGroups,
+                                 int docsPerGroup,
+                                 int groupOffset,
+                                 int docOffset) {
+
+    final Comparator<GroupDoc> groupSortComp = getComparator(groupSort);
+
+ Arrays.sort(groupDocs, groupSortComp);
+    final HashMap<BytesRef,List<GroupDoc>> groups = new HashMap<BytesRef,List<GroupDoc>>();
+    final List<BytesRef> sortedGroups = new ArrayList<BytesRef>();
+    final List<Comparable[]> sortedGroupFields = new ArrayList<Comparable[]>();
+
+ int totalHitCount = 0;
+
+ for(GroupDoc d : groupDocs) {
+ // TODO: would be better to filter by searchTerm before sorting!
+ if (!d.content.equals(searchTerm)) {
+ continue;
+ }
+ totalHitCount++;
+      List<GroupDoc> l = groups.get(d.group);
+ if (l == null) {
+ sortedGroups.add(d.group);
+ if (fillFields) {
+ sortedGroupFields.add(fillFields(d, groupSort));
+ }
+        l = new ArrayList<GroupDoc>();
+ groups.put(d.group, l);
+ }
+ l.add(d);
+ }
+
+ if (groupOffset >= sortedGroups.size()) {
+ // slice is out of bounds
+ return null;
+ }
+
+ final int limit = Math.min(groupOffset + topNGroups, groups.size());
+
+    final Comparator<GroupDoc> docSortComp = getComparator(docSort);
+ final GroupDocs[] result = new GroupDocs[limit-groupOffset];
+ int totalGroupedHitCount = 0;
+ for(int idx=groupOffset;idx < limit;idx++) {
+ final BytesRef group = sortedGroups.get(idx);
+      final List<GroupDoc> docs = groups.get(group);
+ totalGroupedHitCount += docs.size();
+ Collections.sort(docs, docSortComp);
+ final ScoreDoc[] hits;
+ if (docs.size() > docOffset) {
+ final int docIDXLimit = Math.min(docOffset + docsPerGroup, docs.size());
+ hits = new ScoreDoc[docIDXLimit - docOffset];
+ for(int docIDX=docOffset; docIDX < docIDXLimit; docIDX++) {
+ final GroupDoc d = docs.get(docIDX);
+ final FieldDoc fd;
+ if (fillFields) {
+ fd = new FieldDoc(d.id, 0.0f, fillFields(d, docSort));
+ } else {
+ fd = new FieldDoc(d.id, 0.0f);
+ }
+ hits[docIDX-docOffset] = fd;
+ }
+ } else {
+ hits = new ScoreDoc[0];
+ }
+
+ result[idx-groupOffset] = new GroupDocs(0.0f,
+ docs.size(),
+ hits,
+ group,
+ fillFields ? sortedGroupFields.get(idx) : null);
+ }
+
+ return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
+ }
+
+ public void testRandom() throws Exception {
+ for(int iter=0;iter<3;iter++) {
+
+ if (VERBOSE) {
+ System.out.println("TEST: iter=" + iter);
+ }
+
+ final int numDocs = _TestUtil.nextInt(random, 100, 1000) * RANDOM_MULTIPLIER;
+ //final int numDocs = _TestUtil.nextInt(random, 5, 20);
+
+ final int numGroups = _TestUtil.nextInt(random, 1, numDocs);
+
+ if (VERBOSE) {
+ System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
+ }
+
+      final List<BytesRef> groups = new ArrayList<BytesRef>();
+      for(int i=0;i<numGroups;i++) {
+        groups.add(new BytesRef(_TestUtil.randomRealisticUnicodeString(random)));
+      }
+      final String[] contentStrings = new String[] {"a", "b", "c", "d"};
+
+      Directory dir = newDirectory();
+      RandomIndexWriter w = new RandomIndexWriter(
+                                                  random,
+                                                  dir,
+                                                  newIndexWriterConfig(TEST_VERSION_CURRENT,
+                                                                       new MockAnalyzer(random)));
+      final GroupDoc[] groupDocs = new GroupDoc[numDocs];
+      for(int i=0;i<numDocs;i++) {
+        final BytesRef groupValue;
+        if (random.nextInt(24) == 17) {
+          // So we test the "doc doesn't have the group'd field" case:
+          groupValue = null;
+        } else {
+          groupValue = groups.get(random.nextInt(groups.size()));
+        }
+        final GroupDoc groupDoc = new GroupDoc(i,
+                                               groupValue,
+                                               new BytesRef(_TestUtil.randomRealisticUnicodeString(random)),
+                                               new BytesRef(_TestUtil.randomRealisticUnicodeString(random)),
+                                               contentStrings[random.nextInt(contentStrings.length)]);
+        groupDocs[i] = groupDoc;
+
+        Document doc = new Document();
+        if (groupDoc.group != null) {
+          doc.add(newField("group", groupDoc.group.utf8ToString(), Field.Index.NOT_ANALYZED));
+        }
+        doc.add(newField("sort1", groupDoc.sort1.utf8ToString(), Field.Index.NOT_ANALYZED));
+        doc.add(newField("sort2", groupDoc.sort2.utf8ToString(), Field.Index.NOT_ANALYZED));
+        doc.add(newField("content", groupDoc.content, Field.Index.NOT_ANALYZED));
+        doc.add(new NumericField("id").setIntValue(groupDoc.id));
+        w.addDocument(doc);
+      }
+
+      final IndexReader r = w.getReader();
+      w.close();
+
+      final IndexSearcher s = new IndexSearcher(r);
+
+      for(int searchIter=0;searchIter<100;searchIter++) {
+
+        if (VERBOSE) {
+          System.out.println("TEST: searchIter=" + searchIter);
+        }
+
+        final String searchTerm = contentStrings[random.nextInt(contentStrings.length)];
+        final boolean fillFields = random.nextBoolean();
+        final boolean getScores = random.nextBoolean();
+        final boolean getMaxScores = random.nextBoolean();
+        final Sort groupSort = getRandomSort();
+        final Sort docSort = getRandomSort();
+        final int topNGroups = _TestUtil.nextInt(random, 1, 30);
+        final int docsPerGroup = _TestUtil.nextInt(random, 1, 50);
+        final int groupOffset = _TestUtil.nextInt(random, 0, (topNGroups-1)/2);
+        final int docOffset = _TestUtil.nextInt(random, 0, docsPerGroup-1);
+        final boolean doCache = random.nextBoolean();
+
+        final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
+        final CachingCollector cCache;
+        final Collector c;
+        if (doCache) {
+          final double maxCacheMB = random.nextDouble();
+          c = cCache = new CachingCollector(c1, true, maxCacheMB);
+        } else {
+          c = c1;
+          cCache = null;
+        }
+
+        s.search(new TermQuery(new Term("content", searchTerm)), c);
+
+        final Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
+ final TopGroups groupsResult;
+
+ if (topGroups != null) {
+
+ if (VERBOSE) {
+ System.out.println("TEST: topGroups");
+ for (SearchGroup searchGroup : topGroups) {
+ System.out.println(" " + (searchGroup.groupValue == null ? "null" : searchGroup.groupValue.utf8ToString()) + ": " + Arrays.deepToString(searchGroup.sortValues));
+ }
+ }
+
+ final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("group", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
+ if (doCache) {
+ if (cCache.isCached()) {
+ if (VERBOSE) {
+ System.out.println("TEST: cache is intact");
+ }
+ cCache.replay(c2);
+ } else {
+ if (VERBOSE) {
+ System.out.println("TEST: cache was too large");
+ }
+ s.search(new TermQuery(new Term("content", searchTerm)), c2);
+ }
+ } else {
+ s.search(new TermQuery(new Term("content", searchTerm)), c2);
+ }
+
+ groupsResult = c2.getTopGroups(docOffset);
+ } else {
+ groupsResult = null;
+ if (VERBOSE) {
+ System.out.println("TEST: no results");
+ }
+ }
+
+ final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
+
+ try {
+ // NOTE: intentional but temporary field cache insanity!
+ assertEquals(FieldCache.DEFAULT.getInts(r, "id"), expectedGroups, groupsResult);
+ } finally {
+ FieldCache.DEFAULT.purge(r);
+ }
+ }
+
+ r.close();
+ dir.close();
+ }
+ }
+
+ private void assertEquals(int[] docIDtoID, TopGroups expected, TopGroups actual) {
+ if (expected == null) {
+ assertNull(actual);
+ return;
+ }
+ assertNotNull(actual);
+
+ assertEquals(expected.groups.length, actual.groups.length);
+ assertEquals(expected.totalHitCount, actual.totalHitCount);
+ assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount);
+
+    for(int groupIDX=0;groupIDX<expected.groups.length;groupIDX++) {
+      final GroupDocs expectedGroup = expected.groups[groupIDX];
+      final GroupDocs actualGroup = actual.groups[groupIDX];
+      assertEquals(expectedGroup.groupValue, actualGroup.groupValue);
+      assertArrayEquals(expectedGroup.groupSortValues, actualGroup.groupSortValues);
+      assertEquals(expectedGroup.totalHits, actualGroup.totalHits);
+
+      final ScoreDoc[] expectedFDs = expectedGroup.scoreDocs;
+      final ScoreDoc[] actualFDs = actualGroup.scoreDocs;
+      assertEquals(expectedFDs.length, actualFDs.length);
+      for(int docIDX=0;docIDX<expectedFDs.length;docIDX++) {
+        final FieldDoc expectedFD = (FieldDoc) expectedFDs[docIDX];
+        final FieldDoc actualFD = (FieldDoc) actualFDs[docIDX];
+        assertEquals(expectedFD.doc, docIDtoID[actualFD.doc]);
+        assertArrayEquals(expectedFD.fields, actualFD.fields);
+      }
+    }
+  }
+}