Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1445194) +++ lucene/CHANGES.txt (working copy) @@ -159,6 +159,11 @@ to Lucene42DocValuesConsumer) if you want to make this tradeoff. (Adrien Grand, Robert Muir) +* LUCENE-4769: Added OrdinalsCache and CachedOrdsCountingFacetsAggregator + which uses the cache to obtain a document's ordinals. This aggregator + is faster than others, however consumes much more RAM. + (Michael McCandless, Shai Erera) + API Changes * LUCENE-4709: FacetResultNode no longer has a residue field. (Shai Erera) Index: lucene/facet/src/java/org/apache/lucene/facet/encoding/IntDecoder.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/encoding/IntDecoder.java (revision 1445194) +++ lucene/facet/src/java/org/apache/lucene/facet/encoding/IntDecoder.java (working copy) @@ -29,7 +29,8 @@ /** * Decodes the values from the buffer into the given {@link IntsRef}. Note - * that {@code values.offset} and {@code values.length} are set to 0. + * that {@code values.offset} is set to 0, and {@code values.length} is + * updated to denote the number of decoded values. */ public abstract void decode(BytesRef buf, IntsRef values); Index: lucene/facet/src/java/org/apache/lucene/facet/search/CachedOrdsCountingFacetsAggregator.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/CachedOrdsCountingFacetsAggregator.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/search/CachedOrdsCountingFacetsAggregator.java (working copy) @@ -0,0 +1,54 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; + +import org.apache.lucene.facet.params.CategoryListParams; +import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs; +import org.apache.lucene.facet.search.OrdinalsCache.CachedOrds; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link FacetsAggregator} which updates categories values by counting their + * occurrences in matching documents. Uses {@link OrdinalsCache} to obtain the + * category ordinals of each segment. + * + * @lucene.experimental + */ +public class CachedOrdsCountingFacetsAggregator extends IntRollupFacetsAggregator { + + @Override + public void aggregate(MatchingDocs matchingDocs, CategoryListParams clp, FacetArrays facetArrays) throws IOException { + final CachedOrds ords = OrdinalsCache.getCachedOrds(matchingDocs.context, clp); + if (ords == null) { + return; // this segment has no ordinals for the given category list + } + final int[] counts = facetArrays.getIntArray(); + int doc = 0; + int length = matchingDocs.bits.length(); + while (doc < length && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) { + int start = ords.offsets[doc]; + int end = ords.offsets[doc + 1]; + for (int i = start; i < end; i++) { + ++counts[ords.ordinals[i]]; + } + ++doc; + } + } + +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/search/CachedOrdsCountingFacetsAggregator.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/search/CountingFacetsAggregator.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/CountingFacetsAggregator.java (revision 1445194) +++ lucene/facet/src/java/org/apache/lucene/facet/search/CountingFacetsAggregator.java (working copy) @@ -4,7 +4,6 @@ import org.apache.lucene.facet.params.CategoryListParams; import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs; -import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.util.IntsRef; /* @@ -33,7 +32,7 @@ * * @lucene.experimental */ -public class CountingFacetsAggregator implements FacetsAggregator { +public class CountingFacetsAggregator extends IntRollupFacetsAggregator { private final IntsRef ordinals = new IntsRef(32); @@ -57,27 +56,4 @@ } } - private int rollupCounts(int ordinal, int[] children, int[] siblings, int[] counts) { - int count = 0; - while (ordinal != TaxonomyReader.INVALID_ORDINAL) { - int childCount = counts[ordinal]; - childCount += rollupCounts(children[ordinal], children, siblings, counts); - counts[ordinal] = childCount; - count += childCount; - ordinal = siblings[ordinal]; - } - return count; - } - - @Override - public void rollupValues(FacetRequest fr, int ordinal, int[] children, int[] siblings, FacetArrays facetArrays) { - final int[] counts = facetArrays.getIntArray(); - counts[ordinal] += rollupCounts(children[ordinal], children, siblings, counts); - } - - @Override - public final boolean requiresDocScores() { - return false; - } - } Index: lucene/facet/src/java/org/apache/lucene/facet/search/FastCountingFacetsAggregator.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/FastCountingFacetsAggregator.java (revision 1445194) +++ lucene/facet/src/java/org/apache/lucene/facet/search/FastCountingFacetsAggregator.java (working copy) @@ -7,7 +7,6 @@ import org.apache.lucene.facet.params.CategoryListParams; import org.apache.lucene.facet.params.FacetSearchParams; import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs; -import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.util.BytesRef; @@ -37,7 +36,7 @@ * * @lucene.experimental */ -public final class FastCountingFacetsAggregator implements FacetsAggregator { +public final class FastCountingFacetsAggregator extends IntRollupFacetsAggregator { private final BytesRef buf = new BytesRef(32); @@ -95,27 +94,4 @@ } } - private int rollupCounts(int ordinal, int[] children, int[] siblings, int[] counts) { - int count = 0; - while (ordinal != TaxonomyReader.INVALID_ORDINAL) { - int childCount = counts[ordinal]; - childCount += rollupCounts(children[ordinal], children, siblings, counts); - counts[ordinal] = childCount; - count += childCount; - ordinal = siblings[ordinal]; - } - return count; - } - - @Override - public final void rollupValues(FacetRequest fr, int ordinal, int[] children, int[] siblings, FacetArrays facetArrays) { - final int[] counts = facetArrays.getIntArray(); - counts[ordinal] += rollupCounts(children[ordinal], children, siblings, counts); - } - - @Override - public final boolean requiresDocScores() { - return false; - } - } Index: lucene/facet/src/java/org/apache/lucene/facet/search/IntRollupFacetsAggregator.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/IntRollupFacetsAggregator.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/search/IntRollupFacetsAggregator.java (working copy) @@ -0,0 +1,63 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; + +import org.apache.lucene.facet.params.CategoryListParams; +import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link FacetsAggregator} which implements + * {@link #rollupValues(FacetRequest, int, int[], int[], FacetArrays)} by + * summing the values from {@link FacetArrays#getIntArray()}. Extending classes + * should only implement {@link #aggregate}. Also, {@link #requiresDocScores()} + * always returns false. + * + * @lucene.experimental + */ +public abstract class IntRollupFacetsAggregator implements FacetsAggregator { + + @Override + public abstract void aggregate(MatchingDocs matchingDocs, CategoryListParams clp, FacetArrays facetArrays) throws IOException; + + private int rollupValues(int ordinal, int[] children, int[] siblings, int[] values) { + int value = 0; + while (ordinal != TaxonomyReader.INVALID_ORDINAL) { + int childValue = values[ordinal]; + childValue += rollupValues(children[ordinal], children, siblings, values); + values[ordinal] = childValue; + value += childValue; + ordinal = siblings[ordinal]; + } + return value; + } + + @Override + public final void rollupValues(FacetRequest fr, int ordinal, int[] children, int[] siblings, FacetArrays facetArrays) { + final int[] values = facetArrays.getIntArray(); + values[ordinal] += rollupValues(children[ordinal], children, siblings, values); + } + + @Override + public final boolean requiresDocScores() { + return false; + } + +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/search/IntRollupFacetsAggregator.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/search/OrdinalsCache.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/OrdinalsCache.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/search/OrdinalsCache.java (working copy) @@ -0,0 +1,118 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.Map; +import java.util.WeakHashMap; + +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.facet.encoding.IntDecoder; +import org.apache.lucene.facet.params.CategoryListParams; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A per-segment cache of documents' category ordinals. Every {@link CachedOrds} + * holds the ordinals in a raw {@code int[]}, and therefore consumes as much RAM + * as the total number of ordinals found in the segment. + * + *

+ * NOTE: every {@link CachedOrds} is limited to 2.1B total ordinals. If + * that is a limitation for you then consider limiting the segment size to less + * documents, or use an alternative cache which pages through the category + * ordinals. + * + *

+ * NOTE: when using this cache, it is advised to use a + * {@link DocValuesFormat} that does not cache the data in memory, at least for + * the category lists fields, or otherwise you'll be doing double-caching. + */ +public class OrdinalsCache { + + /** Holds the cached ordinals in two paralel {@code int[]} arrays. */ + public static final class CachedOrds { + + public final int[] offsets; + public final int[] ordinals; + + /** + * Creates a new {@link CachedOrds} from the {@link BinaryDocValues}. + * Assumes that the {@link BinaryDocValues} is not {@code null}. + */ + public CachedOrds(BinaryDocValues dv, int maxDoc, CategoryListParams clp) { + final BytesRef buf = new BytesRef(); + + offsets = new int[maxDoc + 1]; + int[] ords = new int[maxDoc]; // let's assume one ordinal per-document as an initial size + + // this aggregator is limited to Integer.MAX_VALUE total ordinals. + int totOrds = 0; + final IntDecoder decoder = clp.createEncoder().createMatchingDecoder(); + final IntsRef values = new IntsRef(32); + for (int docID = 0; docID < maxDoc; docID++) { + offsets[docID] = totOrds; + dv.get(docID, buf); + if (buf.length > 0) { + // this document has facets + decoder.decode(buf, values); + if (totOrds + values.length >= ords.length) { + ords = ArrayUtil.grow(ords, totOrds + values.length + 1); + } + for (int i = 0; i < values.length; i++) { + ords[totOrds++] = values.ints[i]; + } + } + } + offsets[maxDoc] = totOrds; + + // if ords array is bigger by more than 10% of what we really need, shrink it + if ((double) totOrds / ords.length < 0.9) { + this.ordinals = new int[totOrds]; + System.arraycopy(ords, 0, this.ordinals, 0, totOrds); + } else { + this.ordinals = ords; + } + } + } + + private static final Map intsCache = new WeakHashMap(); + + /** + * Returns the {@link CachedOrds} relevant to the given + * {@link AtomicReaderContext}, or {@code null} if there is no + * {@link BinaryDocValues} in this reader for the requested + * {@link CategoryListParams#field}. + */ + public static synchronized CachedOrds getCachedOrds(AtomicReaderContext context, CategoryListParams clp) throws IOException { + BinaryDocValues dv = context.reader().getBinaryDocValues(clp.field); + if (dv == null) { + return null; + } + CachedOrds ci = intsCache.get(dv); + if (ci == null) { + ci = new CachedOrds(dv, context.reader().maxDoc(), clp); + intsCache.put(dv, ci); + } + return ci; + } + +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/search/OrdinalsCache.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/test/org/apache/lucene/facet/search/CountingFacetsAggregatorTest.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/search/CountingFacetsAggregatorTest.java (revision 1445194) +++ lucene/facet/src/test/org/apache/lucene/facet/search/CountingFacetsAggregatorTest.java (working copy) @@ -270,7 +270,15 @@ } private FacetsAccumulator randomAccumulator(FacetSearchParams fsp, IndexReader indexReader, TaxonomyReader taxoReader) { - final FacetsAggregator aggregator = random().nextBoolean() ? new CountingFacetsAggregator() : new FastCountingFacetsAggregator(); + final FacetsAggregator aggregator; + double val = random().nextDouble(); + if (val < 0.6) { + aggregator = new FastCountingFacetsAggregator(); // it's the default, so give it the highest chance + } else if (val < 0.8) { + aggregator = new CountingFacetsAggregator(); + } else { + aggregator = new CachedOrdsCountingFacetsAggregator(); + } return new FacetsAccumulator(fsp, indexReader, taxoReader) { @Override public FacetsAggregator getAggregator() {