Index: lucene/facet/src/java/org/apache/lucene/facet/search/CachedIntsCountingFacetsAggregator.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/search/CachedIntsCountingFacetsAggregator.java (revision 0)
+++ lucene/facet/src/java/org/apache/lucene/facet/search/CachedIntsCountingFacetsAggregator.java (working copy)
@@ -0,0 +1,175 @@
+package org.apache.lucene.facet.search;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.WeakHashMap;
+
+import org.apache.lucene.facet.params.CategoryListParams;
+import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link FacetsAggregator} which updates categories values by counting their
+ * occurrences in matching documents. This aggregator caches the ordinals of
+ * each document in-memory, in two parallel {@code int[]} arrays, and therefore
+ * consumes more RAM than if they were kept "compressed" in-memory, however
+ * achieves significant performance improvements. If you use this aggregator,
+ * make sure you have enough RAM to cache the ordinals, and that you set your
+ * heap size accordingly.
+ *
+ * <p>
+ * <b>NOTE:</b> this aggregator is limited to roughly 2.1B total categories
+ * found in the documents of a single segment. If that is a limitation for you
+ * then consider limiting the segment size to fewer documents, or write a
+ * different aggregator which pages through the categories in the segment.
+ */
+public class CachedIntsCountingFacetsAggregator implements FacetsAggregator {
+
+  /**
+   * Uncompressed ordinals of one segment: the ordinals of document {@code d}
+   * are {@code values[offsets[d]]} up to, excluding, {@code values[offsets[d + 1]]}.
+   */
+  private static final class CachedInts {
+
+    final int[] offsets;
+    final int[] values;
+
+    /**
+     * Decodes the ordinals of all documents in {@code [0, maxDoc)} from {@code dv}.
+     * Every non-empty value is read as a sequence of variable-length integers (7
+     * bits per byte, high bit set on all but the last byte of a number), each one
+     * a delta from the previously decoded ordinal of the same document.
+     */
+    public CachedInts(BinaryDocValues dv, int maxDoc) {
+      final BytesRef buf = new BytesRef();
+
+      offsets = new int[maxDoc + 1];
+      int[] ords = new int[maxDoc]; // let's assume one ordinal per-document as an initial size
+
+      // this aggregator is limited to Integer.MAX_VALUE total ordinals.
+      int totOrds = 0;
+      for (int docID = 0; docID < maxDoc; docID++) {
+        offsets[docID] = totOrds;
+        dv.get(docID, buf);
+        if (buf.length > 0) {
+          // this document has facets
+          int upto = buf.offset + buf.length;
+          int ord = 0;
+          int offset = buf.offset;
+          int prev = 0;
+          while (offset < upto) {
+            byte b = buf.bytes[offset++];
+            if (b >= 0) {
+              prev = ord = ((ord << 7) | b) + prev;
+              if (totOrds == ords.length) {
+                ords = ArrayUtil.grow(ords, 1 + totOrds);
+              }
+              ords[totOrds] = ord;
+              totOrds++;
+              ord = 0;
+            } else {
+              ord = (ord << 7) | (b & 0x7F);
+            }
+          }
+        }
+      }
+      offsets[maxDoc] = totOrds;
+
+      // if ords array is bigger by more than 10% of what we really need, shrink it
+      if ((ords.length / (double) totOrds) - 1.0 > 0.1) {
+        this.values = new int[totOrds];
+        System.arraycopy(ords, 0, this.values, 0, totOrds);
+      } else {
+        this.values = ords;
+      }
+    }
+  }
+
+  /**
+   * Decoded ordinals per {@link BinaryDocValues} instance. Keys are held weakly,
+   * so an entry can be reclaimed once its doc-values instance is unreferenced.
+   */
+  private static final Map<BinaryDocValues,CachedInts> intsCache = new WeakHashMap<BinaryDocValues,CachedInts>();
+
+  /** Returns the cached ordinals for {@code dv}, decoding them on first use. */
+  private static synchronized CachedInts getCachedInts(BinaryDocValues dv, int maxDoc) {
+    CachedInts ci = intsCache.get(dv);
+    if (ci == null) {
+      ci = new CachedInts(dv, maxDoc);
+      intsCache.put(dv, ci);
+    }
+    return ci;
+  }
+
+  /**
+   * Increments the count of every ordinal found in the matching documents of the
+   * given segment; returns without counting if the reader has no values for the field.
+   */
+  @Override
+  public void aggregate(MatchingDocs matchingDocs, CategoryListParams clp, FacetArrays facetArrays) throws IOException {
+    BinaryDocValues dv = matchingDocs.context.reader().getBinaryDocValues(clp.field);
+    if (dv == null) {
+      return; // no categories for this reader
+    }
+    final int[] counts = facetArrays.getIntArray();
+    final CachedInts ci = getCachedInts(dv, matchingDocs.context.reader().maxDoc());
+    int doc = 0;
+    int length = matchingDocs.bits.length();
+    while (doc < length && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) {
+      int start = ci.offsets[doc];
+      int end = ci.offsets[doc + 1];
+      for (int i = start; i < end; i++) {
+        ++counts[ci.values[i]];
+      }
+      ++doc;
+    }
+  }
+
+  /**
+   * Adds to each ordinal in the sibling chain starting at {@code ordinal} the
+   * counts of all its descendants, and returns the sum of the chain's counts.
+   */
+  private int rollupCounts(int ordinal, int[] children, int[] siblings, int[] counts) {
+    int count = 0;
+    while (ordinal != TaxonomyReader.INVALID_ORDINAL) {
+      int childCount = counts[ordinal];
+      childCount += rollupCounts(children[ordinal], children, siblings, counts);
+      counts[ordinal] = childCount;
+      count += childCount;
+      ordinal = siblings[ordinal];
+    }
+    return count;
+  }
+
+  @Override
+  public final void rollupValues(FacetRequest fr, int ordinal, int[] children, int[] siblings, FacetArrays facetArrays) {
+    final int[] counts = facetArrays.getIntArray();
+    counts[ordinal] += rollupCounts(children[ordinal], children, siblings, counts);
+  }
+
+  @Override
+  public boolean requiresDocScores() {
+    return false;
+  }
+
+}

Property changes on: lucene/facet/src/java/org/apache/lucene/facet/search/CachedIntsCountingFacetsAggregator.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/facet/src/test/org/apache/lucene/facet/search/CountingFacetsAggregatorTest.java
===================================================================
--- lucene/facet/src/test/org/apache/lucene/facet/search/CountingFacetsAggregatorTest.java (revision 1444748)
+++ lucene/facet/src/test/org/apache/lucene/facet/search/CountingFacetsAggregatorTest.java (working
copy) @@ -270,7 +270,15 @@ } private FacetsAccumulator randomAccumulator(FacetSearchParams fsp, IndexReader indexReader, TaxonomyReader taxoReader) { - final FacetsAggregator aggregator = random().nextBoolean() ? new CountingFacetsAggregator() : new FastCountingFacetsAggregator(); + final FacetsAggregator aggregator; + double val = random().nextDouble(); + if (val < 0.6) { + aggregator = new FastCountingFacetsAggregator(); // it's the default, so give it the highest chance + } else if (val < 0.8) { + aggregator = new CountingFacetsAggregator(); + } else { + aggregator = new CachedIntsCountingFacetsAggregator(); + } return new FacetsAccumulator(fsp, indexReader, taxoReader) { @Override public FacetsAggregator getAggregator() {