Index: lucene/facet/src/java/org/apache/lucene/facet/search/CachedIntsCountingFacetsAggregator.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/CachedIntsCountingFacetsAggregator.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/search/CachedIntsCountingFacetsAggregator.java (working copy) @@ -0,0 +1,152 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.Map; +import java.util.WeakHashMap; + +import org.apache.lucene.facet.params.CategoryListParams; +import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link FacetsAggregator} which updates categories values by counting their + * occurrences in matching documents. 
This aggregator caches the ordinals of + * each document in-memory, in two parallel {@code int[]} arrays, and therefore + * consumes more RAM than if they were kept "compressed" in-memory, however + * achieves significant performance improvements. If you use this aggregator, + * make sure you have enough RAM to cache the ordinals, and that you set your + * heap size accordingly. + * + *
+ * NOTE: this aggregator is limited to roughly 2.1B total categories
+ * found in the documents of a single segment. If that is a limitation for you
+ * then consider limiting the segment size to fewer documents, or write a
+ * different aggregator which pages through the categories in the segment.
+ */
+public class CachedIntsCountingFacetsAggregator implements FacetsAggregator {
+
+ // Per-segment cache of decoded category ordinals, stored in a CSR-like
+ // layout: values holds all documents' ordinals back-to-back, and
+ // offsets[docID] is the index into values where that document's ordinals
+ // begin, so doc docID owns values[offsets[docID]] .. values[offsets[docID + 1]] - 1.
+ private static final class CachedInts {
+
+ // offsets has maxDoc + 1 entries; the extra trailing entry marks the end
+ // of the last document's ordinals (see offsets[maxDoc] below).
+ final int[] offsets;
+ final int[] values;
+
+ // Decodes every document's ordinals from the given BinaryDocValues into
+ // offsets/values. Each document's bytes are read as delta-coded,
+ // variable-length-encoded integers (see the decode loop) --
+ // NOTE(review): this must match the category list's on-disk encoding;
+ // confirm against the CategoryListParams encoder in use.
+ public CachedInts(BinaryDocValues dv, int maxDoc) {
+ final BytesRef buf = new BytesRef();
+
+ offsets = new int[maxDoc + 1];
+ int[] ords = new int[maxDoc]; // let's assume one ordinal per-document as an initial size
+
+ // this aggregator is limited to Integer.MAX_VALUE total ordinals.
+ int totOrds = 0;
+ for (int docID = 0; docID < maxDoc; docID++) {
+ offsets[docID] = totOrds;
+ dv.get(docID, buf);
+ if (buf.length > 0) {
+ // this document has facets
+ int upto = buf.offset + buf.length;
+ int ord = 0;
+ int offset = buf.offset;
+ int prev = 0; // previously decoded ordinal, for undoing the delta (gap) coding
+ while (offset < upto) {
+ byte b = buf.bytes[offset++];
+ if (b >= 0) {
+ // high bit clear: final byte of this value. Fold in its 7 bits,
+ // add the previous ordinal (delta decode), and record the result.
+ prev = ord = ((ord << 7) | b) + prev;
+ if (totOrds == ords.length) {
+ ords = ArrayUtil.grow(ords, 1 + totOrds);
+ }
+ ords[totOrds] = ord;
+ totOrds++;
+ ord = 0; // reset accumulator for the next value
+ } else {
+ // high bit set: continuation byte -- accumulate its low 7 bits.
+ ord = (ord << 7) | (b & 0x7F);
+ }
+ }
+ }
+ }
+ offsets[maxDoc] = totOrds; // sentinel: end of the last document's ordinals
+
+ // if ords array is bigger by more than 10% of what we really need, shrink it
+ if ((ords.length / (double) totOrds) - 1.0 > 0.1) {
+ this.values = new int[totOrds];
+ System.arraycopy(ords, 0, this.values, 0, totOrds);
+ } else {
+ this.values = ords;
+ }
+ }
+ }
+
+ private static final Map