# LUCENE-5333: adds a "sparse" mode to SortedSetDocValuesAccumulator that
# returns every dimension with hits (topCount labels each), plus a test.
#
# NOTE(review): this patch was recovered from a copy in which newlines and
# indentation had been collapsed and generic type arguments (<...>) stripped.
# Line breaks, indentation, type arguments and the positions of blank context
# lines were reconstructed from the hunk headers and the surrounding code.
# Trailing whitespace and runs of spaces inside string literals could not be
# recovered -- verify against revision 1540424 before applying.
Index: lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java
===================================================================
--- lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java (revision 1540424)
+++ lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java (working copy)
@@ -193,4 +193,62 @@
     searcher.getIndexReader().close();
     dir.close();
   }
+
+  // LUCENE-5333
+  public void testSparseFacets() throws Exception {
+    assumeTrue("Test requires SortedSetDV support", defaultCodecSupportsSortedSet());
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+
+    SortedSetDocValuesFacetFields dvFields = new SortedSetDocValuesFacetFields();
+
+    Document doc = new Document();
+    List<CategoryPath> paths = new ArrayList<CategoryPath>();
+    paths.add(new CategoryPath("a", "foo1"));
+    dvFields.addFields(doc, paths);
+    writer.addDocument(doc);
+
+    if (random().nextBoolean()) {
+      writer.commit();
+    }
+
+    doc = new Document();
+    paths = new ArrayList<CategoryPath>();
+    paths.add(new CategoryPath("a", "foo2"));
+    paths.add(new CategoryPath("b", "bar1"));
+    dvFields.addFields(doc, paths);
+    writer.addDocument(doc);
+
+    if (random().nextBoolean()) {
+      writer.commit();
+    }
+
+    doc = new Document();
+    paths = new ArrayList<CategoryPath>();
+    paths.add(new CategoryPath("a", "foo3"));
+    paths.add(new CategoryPath("b", "bar2"));
+    paths.add(new CategoryPath("c", "baz1"));
+    dvFields.addFields(doc, paths);
+    writer.addDocument(doc);
+
+    // NRT open
+    IndexSearcher searcher = newSearcher(writer.getReader());
+    writer.close();
+
+    // Per-top-reader state:
+    SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(searcher.getIndexReader());
+
+    // Ask for top 10 labels for all dims that have counts:
+    FacetsCollector c = FacetsCollector.create(new SortedSetDocValuesAccumulator(state, 10));
+    searcher.search(new MatchAllDocsQuery(), c);
+    List<FacetResult> results = c.getFacetResults();
+
+    assertEquals(3, results.size());
+    assertEquals("a (3)\n foo1 (1)\n foo2 (1)\n foo3 (1)\n", FacetTestUtils.toSimpleString(results.get(0)));
+    assertEquals("b (2)\n bar1 (1)\n bar2 (1)\n", FacetTestUtils.toSimpleString(results.get(1)));
+    assertEquals("c (1)\n baz1 (1)\n", FacetTestUtils.toSimpleString(results.get(2)));
+
+    searcher.getIndexReader().close();
+    dir.close();
+  }
 }
Index: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesAccumulator.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesAccumulator.java (revision 1540424)
+++ lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesAccumulator.java (working copy)
@@ -22,6 +22,7 @@
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
+import java.util.Iterator;
 import java.util.List;
 
 import org.apache.lucene.facet.params.CategoryListParams;
@@ -36,8 +37,8 @@
 import org.apache.lucene.facet.taxonomy.CategoryPath;
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
 import org.apache.lucene.index.MultiDocValues;
-import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
 import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.util.BytesRef;
@@ -57,37 +58,62 @@
   final SortedSetDocValues dv;
   final String field;
   final FacetArrays facetArrays;
-  
-  /** Constructor with the given facet search params. */
+  final int topCount;
+
+  /** Sparse faceting: returns any dimension that had any
+   *  hits, topCount labels per dimension. */
+  public SortedSetDocValuesAccumulator(SortedSetDocValuesReaderState state, int topCount)
+      throws IOException {
+    this(state, null, null, topCount);
+  }
+
+  /** Non-sparse faceting: returns the specific
+   *  {@link FacetRequest}s */
   public SortedSetDocValuesAccumulator(SortedSetDocValuesReaderState state, FacetSearchParams fsp)
       throws IOException {
-    this(state, fsp, null);
+    this(state, fsp, null, -1);
   }
 
   public SortedSetDocValuesAccumulator(SortedSetDocValuesReaderState state, FacetSearchParams fsp, FacetArrays arrays)
       throws IOException {
+    this(state, fsp, arrays, -1);
+  }
+
+  private SortedSetDocValuesAccumulator(SortedSetDocValuesReaderState state, FacetSearchParams fsp, FacetArrays arrays, int topCount)
+      throws IOException {
     super(fsp);
     this.state = state;
     this.field = state.getField();
     this.facetArrays = arrays == null ? new FacetArrays(state.getSize()) : arrays;
     dv = state.getDocValues();
+    this.topCount = topCount;
 
-    // Check params:
-    for (FacetRequest fr : fsp.facetRequests) {
-      if (!(fr instanceof CountFacetRequest)) {
-        throw new IllegalArgumentException("this accumulator only supports CountFacetRequest; got " + fr);
+    if (fsp != null) {
+
+      assert topCount == -1;
+
+      if (fsp.facetRequests.size() == 0) {
+        throw new IllegalArgumentException("must provide at least one FacetRequest (got 0)");
       }
-      if (fr.categoryPath.length != 1) {
-        throw new IllegalArgumentException("this accumulator only supports 1-level CategoryPath; got " + fr.categoryPath);
-      }
-      if (fr.getDepth() != 1) {
-        throw new IllegalArgumentException("this accumulator only supports depth=1; got " + fr.getDepth());
-      }
-      String dim = fr.categoryPath.components[0];
 
-      SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim);
-      if (ordRange == null) {
-        throw new IllegalArgumentException("dim \"" + dim + "\" does not exist");
+      // Check params:
+      for (FacetRequest fr : fsp.facetRequests) {
+        if (!(fr instanceof CountFacetRequest)) {
+          throw new IllegalArgumentException("this accumulator only supports CountFacetRequest; got " + fr);
+        }
+        if (fr.getDepth() != 1) {
+          throw new IllegalArgumentException("this accumulator only supports depth=1; got " + fr.getDepth());
+        }
+
+        if (fr.categoryPath.length != 1) {
+          throw new IllegalArgumentException("this accumulator only supports 1-level CategoryPath; got " + fr.categoryPath);
+        }
+        String dim = fr.categoryPath.components[0];
+
+        SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim);
+        if (ordRange == null) {
+          throw new IllegalArgumentException("dim \"" + dim + "\" does not exist");
+        }
       }
     }
   }
@@ -211,6 +237,37 @@
       aggregator.aggregate(md, facetArrays);
     }
 
+    boolean isSparse = searchParams == null;
+
+    Iterator<FacetRequest> requests;
+
+    if (isSparse) {
+      // We count all dims but only return those that have
+      // count > 0:
+
+      final Iterator<String> dims = state.getPrefixToOrdRange().keySet().iterator();
+
+      requests = new Iterator<FacetRequest>() {
+
+        @Override
+        public boolean hasNext() {
+          return dims.hasNext();
+        }
+
+        @Override
+        public FacetRequest next() {
+          return new CountFacetRequest(new CategoryPath(dims.next()), topCount);
+        }
+
+        @Override
+        public void remove() {
+          throw new UnsupportedOperationException();
+        }
+      };
+    } else {
+      requests = searchParams.facetRequests.iterator();
+    }
+
     // compute top-K
     List<FacetResult> results = new ArrayList<FacetResult>();
 
@@ -218,7 +275,8 @@
 
     BytesRef scratch = new BytesRef();
 
-    for (FacetRequest request : searchParams.facetRequests) {
+    while (requests.hasNext()) {
+      FacetRequest request = requests.next();
       String dim = request.categoryPath.components[0];
       SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim);
       // checked in ctor:
@@ -252,9 +310,15 @@
             }
           });
 
-        CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
-        if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
-          dimCount = 0;
+        if (isSparse) {
+          if (dimCount == 0) {
+            continue;
+          }
+        } else {
+          CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
+          if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
+            dimCount = 0;
+          }
         }
 
         FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
@@ -294,9 +358,15 @@
         }
       }
 
-      CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
-      if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
-        dimCount = 0;
+      if (isSparse) {
+        if (dimCount == 0) {
+          continue;
+        }
+      } else {
+        CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim);
+        if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) {
+          dimCount = 0;
+        }
       }
 
       FacetResultNode rootNode = new FacetResultNode(-1, dimCount);
@@ -313,6 +383,28 @@
       results.add(new FacetResult(request, rootNode, childCount));
     }
 
+    if (isSparse) {
+      // Sort by highest count:
+      Collections.sort(results,
+                       new Comparator<FacetResult>() {
+                         @Override
+                         public int compare(FacetResult a, FacetResult b) {
+                           FacetResultNode aRoot = a.getFacetResultNode();
+                           FacetResultNode bRoot = b.getFacetResultNode();
+                           int countA = (int) aRoot.value;
+                           int countB = (int) bRoot.value;
+                           if (countA > countB) {
+                             return -1;
+                           } else if (countB > countA) {
+                             return 1;
+                           } else {
+                             // Tie break by dimension
+                             return aRoot.label.components[0].compareTo(bRoot.label.components[0]);
+                           }
+                         }
+                       });
+    }
+
     return results;
   }
 
@@ -320,5 +412,4 @@
   public boolean requiresDocScores() {
     return false;
   }
-
 }
Index: lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java (revision 1540424)
+++ lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java (working copy)
@@ -145,6 +145,10 @@
     return prefixToOrdRange.get(dim);
   }
 
+  Map<String,OrdRange> getPrefixToOrdRange() {
+    return prefixToOrdRange;
+  }
+
   String getField() {
     return field;
   }