Index: lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java (revision 1455276) +++ lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java (working copy) @@ -27,9 +27,11 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.FacetTestUtils; import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.index.SortedSetDocValuesFacetField; import org.apache.lucene.facet.params.CategoryListParams; import org.apache.lucene.facet.params.FacetIndexingParams; import org.apache.lucene.facet.params.FacetSearchParams; @@ -243,4 +245,68 @@ dir.close(); taxoDir.close(); } + + public void testSortedSetDocValuesAccumulator() throws Exception { + assumeTrue("Test requires SortedSetDV support", defaultCodecSupportsSortedSet()); + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField(new CategoryPath("a", "foo"))); + doc.add(new SortedSetDocValuesFacetField(new CategoryPath("a", "bar"))); + doc.add(new SortedSetDocValuesFacetField(new CategoryPath("a", "zoo"))); + doc.add(new SortedSetDocValuesFacetField(new CategoryPath("b", "baz"))); + doc.add(new SortedSetDocValuesFacetField(new CategoryPath("b", "baz" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR + "foo"))); + writer.addDocument(doc); + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField(new CategoryPath("a", "foo"))); + writer.addDocument(doc); + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + 
writer.close(); + + List requests = new ArrayList(); + requests.add(new CountFacetRequest(new CategoryPath("a"), 10)); + requests.add(new CountFacetRequest(new CategoryPath("b"), 10)); + + final boolean doDimCount = random().nextBoolean(); + + CategoryListParams clp = new CategoryListParams() { + @Override + public OrdinalPolicy getOrdinalPolicy(String dimension) { + return doDimCount ? OrdinalPolicy.NO_PARENTS : OrdinalPolicy.ALL_BUT_DIMENSION; + } + }; + + FacetSearchParams fsp = new FacetSearchParams(new FacetIndexingParams(clp), requests); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(searcher.getIndexReader()); + + //SortedSetDocValuesCollector c = new SortedSetDocValuesCollector(state); + //SortedSetDocValuesCollectorMergeBySeg c = new SortedSetDocValuesCollectorMergeBySeg(state); + + FacetsCollector c = FacetsCollector.create(new SortedSetDocValuesAccumulator(fsp, state)); + + searcher.search(new MatchAllDocsQuery(), c); + + //List results = c.getFacetResults(requests); + List results = c.getFacetResults(); + + assertEquals(2, results.size()); + + int dimCount = doDimCount ? 4 : 0; + assertEquals("a (" + dimCount + ")\n foo (2)\n bar (1)\n zoo (1)\n", FacetTestUtils.toSimpleString(results.get(0))); + + dimCount = doDimCount ? 
2 : 0; + assertEquals("b (" + dimCount + ")\n baz (1)\n baz" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR + "foo (1)\n", FacetTestUtils.toSimpleString(results.get(1))); + + searcher.getIndexReader().close(); + dir.close(); + } } Index: lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java (revision 1455276) +++ lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java (working copy) @@ -34,6 +34,7 @@ import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.FacetTestUtils; import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.index.SortedSetDocValuesFacetField; import org.apache.lucene.facet.params.FacetIndexingParams; import org.apache.lucene.facet.params.FacetSearchParams; import org.apache.lucene.facet.search.DrillSideways.DrillSidewaysResult; @@ -401,6 +402,8 @@ public void testRandom() throws Exception { + boolean canUseDV = defaultCodecSupportsSortedSet(); + while (aChance == 0.0) { aChance = random().nextDouble(); } @@ -437,7 +440,8 @@ s = _TestUtil.randomRealisticUnicodeString(random()); // We cannot include this character else the label // is silently truncated: - if (s.indexOf(FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR) == -1) { + if (s.indexOf(FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR) == -1 && + (!canUseDV || s.indexOf('/') == -1)) { break; } } @@ -506,24 +510,33 @@ for(int dim=0;dim scores = new HashMap(); for(ScoreDoc sd : hits.scoreDocs) { Index: lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java (working copy) @@ 
-0,0 +1,252 @@ +package org.apache.lucene.facet.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.facet.params.CategoryListParams; +import org.apache.lucene.facet.params.FacetSearchParams; +import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues; +import org.apache.lucene.index.MultiDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PriorityQueue; + +/** A {@link FacetsAccumulator} that uses previously + * indexed {@link SortedSetDocValues} to perform faceting, + * without requiring a separate taxonomy index. Faceting is + * a bit slower (~25%), and there is added cost on every + * {@link IndexReader} open to create a new {@link + * SortedSetDocValuesReaderState}. Furthermore, this does + * not support hierarchical facets; only flat (dimension + + * label) facets, but it uses quite a bit less RAM to do so. 
*/ +public class SortedSetDocValuesAccumulator extends FacetsAccumulator { + + final SortedSetDocValuesReaderState state; + final SortedSetDocValues dv; + final String field; + + public SortedSetDocValuesAccumulator(FacetSearchParams fsp, SortedSetDocValuesReaderState state) throws IOException { + super(fsp, null, null, new FacetArrays((int) state.getDocValues().getValueCount())); + this.state = state; + this.field = state.getField(); + dv = state.getDocValues(); + + // Check params: + for(FacetRequest request : fsp.facetRequests) { + if (!(request instanceof CountFacetRequest)) { + throw new IllegalArgumentException("this collector only supports CountFacetRequest; got " + request); + } + if (request.categoryPath.length != 1) { + throw new IllegalArgumentException("this collector only supports depth 1 CategoryPath; got " + request.categoryPath); + } + if (request.getDepth() != 1) { + throw new IllegalArgumentException("this collector only supports depth=1; got " + request.getDepth()); + } + String dim = request.categoryPath.components[0]; + + SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim); + if (ordRange == null) { + throw new IllegalArgumentException("dim \"" + dim + "\" does not exist"); + } + } + } + + @Override + public FacetsAggregator getAggregator() { + + return new FacetsAggregator() { + + @Override + public void aggregate(MatchingDocs matchingDocs, CategoryListParams clp, FacetArrays facetArrays) throws IOException { + + SortedSetDocValues segValues = matchingDocs.context.reader().getSortedSetDocValues(field); + if (segValues == null) { + return; + } + + final int[] counts = facetArrays.getIntArray(); + final int maxDoc = matchingDocs.context.reader().maxDoc(); + assert maxDoc == matchingDocs.bits.length(); + + if (dv instanceof MultiSortedSetDocValues) { + MultiDocValues.OrdinalMap ordinalMap = ((MultiSortedSetDocValues) dv).mapping; + int segOrd = matchingDocs.context.ord; + + int numSegOrds = (int) segValues.getValueCount(); + 
+ if (matchingDocs.totalHits < numSegOrds/10) { + // Remap every ord to global ord as we iterate: + final int[] segCounts = new int[numSegOrds]; + int doc = 0; + while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) { + segValues.setDocument(doc); + int term = (int) segValues.nextOrd(); + while (term != SortedSetDocValues.NO_MORE_ORDS) { + counts[(int) ordinalMap.getGlobalOrd(segOrd, term)]++; + term = (int) segValues.nextOrd(); + } + ++doc; + } + } else { + + // First count in seg-ord space: + final int[] segCounts = new int[numSegOrds]; + int doc = 0; + while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) { + segValues.setDocument(doc); + int term = (int) segValues.nextOrd(); + while (term != SortedSetDocValues.NO_MORE_ORDS) { + segCounts[term]++; + term = (int) segValues.nextOrd(); + } + ++doc; + } + + // Then, migrate to global ords: + for(int ord=0;ord { + public TopCountPQ(int topN) { + super(topN, false); + } + + @Override + protected boolean lessThan(FacetResultNode a, FacetResultNode b) { + if (a.value < b.value) { + return true; + } else if (a.value > b.value) { + return false; + } else { + return a.ordinal > b.ordinal; + } + } + } + + @Override + public List accumulate(List matchingDocs) throws IOException { + + FacetsAggregator aggregator = getAggregator(); + for (CategoryListParams clp : getCategoryLists()) { + for (MatchingDocs md : matchingDocs) { + aggregator.aggregate(md, clp, facetArrays); + } + } + + // compute top-K + + List results = new ArrayList(); + + int[] counts = facetArrays.getIntArray(); + + for(FacetRequest request : searchParams.facetRequests) { + String dim = request.categoryPath.components[0]; + SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim); + // checked in ctor: + assert ordRange != null; + + TopCountPQ q = new TopCountPQ(request.numResults); + + int bottomCount = 0; + + //System.out.println("collect"); + int dimCount = 0; + FacetResultNode reuse = null; + for(int 
ord=ordRange.start; ord<=ordRange.end; ord++) { + //System.out.println(" ord=" + ord + " count= "+ counts[ord] + " bottomCount=" + bottomCount); + if (counts[ord] > bottomCount) { + dimCount += counts[ord]; + //System.out.println(" keep"); + if (reuse == null) { + reuse = new FacetResultNode(ord, counts[ord]); + } else { + reuse.ordinal = ord; + reuse.value = counts[ord]; + } + reuse = q.insertWithOverflow(reuse); + if (q.size() == request.numResults) { + bottomCount = (int) q.top().value; + //System.out.println(" new bottom=" + bottomCount); + } + } + } + + CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim); + if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) { + dimCount = 0; + } + + FacetResultNode rootNode = new FacetResultNode(-1, dimCount); + rootNode.label = new CategoryPath(new String[] {dim}); + + FacetResultNode[] childNodes = new FacetResultNode[q.size()]; + BytesRef scratch = new BytesRef(); + for(int i=childNodes.length-1;i>=0;i--) { + childNodes[i] = q.pop(); + dv.lookupOrd(childNodes[i].ordinal, scratch); + childNodes[i].label = new CategoryPath(scratch.utf8ToString().split(state.separatorRegex, 2)); + } + rootNode.subResults = Arrays.asList(childNodes); + + results.add(new FacetResult(request, rootNode, childNodes.length)); + } + + return results; + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesReaderState.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesReaderState.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesReaderState.java 
(working copy) @@ -0,0 +1,150 @@ +package org.apache.lucene.facet.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Pattern; + +import org.apache.lucene.facet.params.FacetIndexingParams; +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.CompositeReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.SlowCompositeReaderWrapper; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; + +/** Wraps a {@link IndexReader} and resolves ords + * using existing {@link SortedSetDocValues} APIs without a + * separate taxonomy index. This only supports flat facets + * (dimension + label), and it makes faceting a bit + * slower, adds some cost at reopen time, but avoids + * managing the separate taxonomy index. It also requires + * less RAM than the taxonomy index, as it manages the flat + * (2-level) hierarchy more efficiently. In addition, the + * tie-break during faceting is now meaningful (in label + * sorted order). + * + *

NOTE: creating an instance of this class is + * somewhat costly, as it computes per-segment ordinal maps, + * so you should create it once and re-use that one instance + * for a given {@link IndexReader}. */ + +public final class SortedSetDocValuesReaderState { + + private final String field; + private final AtomicReader topReader; + private final int valueCount; + final char separator; + final String separatorRegex; + + /** Holds start/end range of ords, which maps to one + * dimension (someday we may generalize it to map to + * hierarchies within one dimension). */ + static final class OrdRange { + public final int start; + public final int end; + + /** Start and end are inclusive. */ + public OrdRange(int start, int end) { + this.start = start; + this.end = end; + } + } + + private final Map prefixToOrdRange = new HashMap(); + + /** Create an instance, scanning the {@link + * SortedSetDocValues} from the provided reader, with + * default {@link FacetIndexingParams}. */ + public SortedSetDocValuesReaderState(IndexReader reader) throws IOException { + this(reader, FacetIndexingParams.DEFAULT); + } + + /** Create an instance, scanning the {@link + * SortedSetDocValues} from the provided reader and + * {@link FacetIndexingParams}. 
*/ + public SortedSetDocValuesReaderState(IndexReader reader, FacetIndexingParams fip) throws IOException { + + this.field = fip.getCategoryListParams(null).field + "sdv"; + this.separator = FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR; + this.separatorRegex = Pattern.quote(Character.toString(separator)); + + // We need this to create thread-safe MultiSortedSetDV + // per collector: + if (reader instanceof AtomicReader) { + topReader = (AtomicReader) reader; + } else { + topReader = new SlowCompositeReaderWrapper((CompositeReader) reader); + } + SortedSetDocValues dv = topReader.getSortedSetDocValues(field); + if (dv == null) { + throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues"); + } + if (dv.getValueCount() > Integer.MAX_VALUE) { + throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount()); + } + valueCount = (int) dv.getValueCount(); + + // TODO: we can make this more efficient if eg we can be + // "involved" when OrdinalMap is being created? Ie see + // each term/ord it's assigning as it goes... + String lastDim = null; + int startOrd = -1; + BytesRef spare = new BytesRef(); + + // TODO: this approach can work for full hierarchy?; + // TaxoReader can't do this since ords are not in + // "sorted order" ... but we should generalize this to + // support arbitrary hierarchy: + for(int ord=0;ord