Index: lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java (revision 1451890) +++ lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java (working copy) @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.FacetTestUtils; import org.apache.lucene.facet.index.FacetFields; @@ -243,4 +244,68 @@ dir.close(); taxoDir.close(); } + + public void testSortedSetDocValuesCollector() throws Exception { + assumeTrue("Test requires SortedSetDV support", defaultCodecSupportsSortedSet()); + + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + Document doc = new Document(); + doc.add(new SortedSetDocValuesField("myfacets", new BytesRef("a/foo"))); + doc.add(new SortedSetDocValuesField("myfacets", new BytesRef("a/bar"))); + doc.add(new SortedSetDocValuesField("myfacets", new BytesRef("a/zoo"))); + doc.add(new SortedSetDocValuesField("myfacets", new BytesRef("b/baz"))); + writer.addDocument(doc); + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new SortedSetDocValuesField("myfacets", new BytesRef("a/foo"))); + writer.addDocument(doc); + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + writer.close(); + + List requests = new ArrayList(); + requests.add(new CountFacetRequest(new CategoryPath("a", '/'), 10)); + requests.add(new CountFacetRequest(new CategoryPath("b", '/'), 10)); + + final boolean doDimCount = random().nextBoolean(); + + CategoryListParams clp = new CategoryListParams() { + @Override + public OrdinalPolicy getOrdinalPolicy(String dimension) { 
+ return doDimCount ? OrdinalPolicy.NO_PARENTS : OrdinalPolicy.ALL_BUT_DIMENSION; + } + }; + + FacetSearchParams fsp = new FacetSearchParams(new FacetIndexingParams(clp), requests); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(searcher.getIndexReader(), "myfacets"); + + //SortedSetDocValuesCollector c = new SortedSetDocValuesCollector(state); + //SortedSetDocValuesCollectorMergeBySeg c = new SortedSetDocValuesCollectorMergeBySeg(state); + + FacetsCollector c = FacetsCollector.create(new SortedSetDocValuesAccumulator(fsp, state)); + + searcher.search(new MatchAllDocsQuery(), c); + + //List results = c.getFacetResults(requests); + List results = c.getFacetResults(); + + assertEquals(2, results.size()); + + int dimCount = doDimCount ? 4 : 0; + assertEquals("a (" + dimCount + ")\n foo (2)\n bar (1)\n zoo (1)\n", FacetTestUtils.toSimpleString(results.get(0))); + + dimCount = doDimCount ? 1 : 0; + assertEquals("b (" + dimCount + ")\n baz (1)\n", FacetTestUtils.toSimpleString(results.get(1))); + + searcher.getIndexReader().close(); + dir.close(); + } } Index: lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java (revision 1451890) +++ lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java (working copy) @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.index.FacetFields; @@ -345,6 +346,8 @@ public void testRandom() throws Exception { + boolean canUseDV = defaultCodecSupportsSortedSet(); + while (aChance == 0.0) { aChance = random().nextDouble(); } 
@@ -381,7 +384,8 @@ s = _TestUtil.randomRealisticUnicodeString(random()); // We cannot include this character else the label // is silently truncated: - if (s.indexOf(FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR) == -1) { + if (s.indexOf(FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR) == -1 && + (!canUseDV || s.indexOf('/') == -1)) { break; } } @@ -455,6 +459,9 @@ if (VERBOSE) { System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue])); } + if (canUseDV) { + doc.add(new SortedSetDocValuesField("dvFacets", new BytesRef("dim" + dim + "/" + dimValues[dim][dimValue]))); + } } int dimValue2 = rawDoc.dims2[dim]; if (dimValue2 != -1) { @@ -463,11 +470,15 @@ if (VERBOSE) { System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2])); } + if (canUseDV) { + doc.add(new SortedSetDocValuesField("dvFacets", new BytesRef("dim" + dim + "/" + dimValues[dim][dimValue2]))); + } } } if (!paths.isEmpty()) { facetFields.addFields(doc, paths); } + w.addDocument(doc); } @@ -499,6 +510,14 @@ } IndexReader r = w.getReader(); w.close(); + + final SortedSetDocValuesReaderState sortedSetDVState; + if (canUseDV) { + sortedSetDVState = new SortedSetDocValuesReaderState(r, "dvFacets"); + } else { + sortedSetDVState = null; + } + if (VERBOSE) { System.out.println("r.numDocs() = " + r.numDocs()); } @@ -633,8 +652,28 @@ SimpleFacetResult expected = slowDrillSidewaysSearch(s, docs, contentToken, drillDowns, dimValues, filter); Sort sort = new Sort(new SortField("id", SortField.Type.STRING)); - DrillSidewaysResult actual = new DrillSideways(s, tr).search(ddq, filter, null, numDocs, sort, true, true, fsp); + DrillSideways ds; + if (canUseDV && random().nextBoolean()) { + if (VERBOSE) { + System.out.println(" use SortedSetDV"); + } + ds = new DrillSideways(s, null) { + @Override + protected FacetsAccumulator getDrillDownAccumulator(FacetSearchParams fsp) throws IOException { + return new SortedSetDocValuesAccumulator(fsp, sortedSetDVState); + } + @Override + 
protected FacetsAccumulator getDrillSidewaysAccumulator(String dim, FacetSearchParams fsp) throws IOException { + return new SortedSetDocValuesAccumulator(fsp, sortedSetDVState); + } + }; + } else { + ds = new DrillSideways(s, tr); + } + + DrillSidewaysResult actual = ds.search(ddq, filter, null, numDocs, sort, true, true, fsp); + TopDocs hits = s.search(baseQuery, numDocs); Map scores = new HashMap(); for(ScoreDoc sd : hits.scoreDocs) { Index: lucene/facet/src/java/org/apache/lucene/facet/search/DrillSideways.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/DrillSideways.java (revision 1451890) +++ lucene/facet/src/java/org/apache/lucene/facet/search/DrillSideways.java (working copy) @@ -192,7 +192,7 @@ doDocScores, doMaxScore, true); - DrillSidewaysResult r = new DrillSideways(searcher, taxoReader).search(query, hitCollector, fsp); + DrillSidewaysResult r = search(query, hitCollector, fsp); r.hits = hitCollector.topDocs(); return r; } else { @@ -207,20 +207,20 @@ public DrillSidewaysResult search(ScoreDoc after, DrillDownQuery query, int topN, FacetSearchParams fsp) throws IOException { TopScoreDocCollector hitCollector = TopScoreDocCollector.create(Math.min(topN, searcher.getIndexReader().maxDoc()), after, true); - DrillSidewaysResult r = new DrillSideways(searcher, taxoReader).search(query, hitCollector, fsp); + DrillSidewaysResult r = search(query, hitCollector, fsp); r.hits = hitCollector.topDocs(); return r; } /** Override this to use a custom drill-down {@link * FacetsAccumulator}. */ - protected FacetsAccumulator getDrillDownAccumulator(FacetSearchParams fsp) { + protected FacetsAccumulator getDrillDownAccumulator(FacetSearchParams fsp) throws IOException { return FacetsAccumulator.create(fsp, searcher.getIndexReader(), taxoReader); } /** Override this to use a custom drill-sideways {@link * FacetsAccumulator}. 
*/ - protected FacetsAccumulator getDrillSidewaysAccumulator(String dim, FacetSearchParams fsp) { + protected FacetsAccumulator getDrillSidewaysAccumulator(String dim, FacetSearchParams fsp) throws IOException { return FacetsAccumulator.create(fsp, searcher.getIndexReader(), taxoReader); } Index: lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java (working copy) @@ -0,0 +1,218 @@ +package org.apache.lucene.facet.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.facet.params.CategoryListParams; +import org.apache.lucene.facet.params.FacetSearchParams; +import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues; +import org.apache.lucene.index.MultiDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer; + +public class SortedSetDocValuesAccumulator extends FacetsAccumulator { + + final SortedSetDocValuesReaderState state; + final SortedSetDocValues dv; + final String field; + + public SortedSetDocValuesAccumulator(FacetSearchParams fsp, SortedSetDocValuesReaderState state) throws IOException { + super(fsp, null, null, new FacetArrays((int) state.getDocValues().getValueCount())); + this.state = state; + this.field = state.getField(); + dv = state.getDocValues(); + + // Check params: + for(FacetRequest request : fsp.facetRequests) { + if (!(request instanceof CountFacetRequest)) { + throw new IllegalArgumentException("this collector only supports CountFacetRequest; got " + request); + } + if (request.categoryPath.length != 1) { + throw new IllegalArgumentException("this collector only supports depth 1 CategoryPath; got " + request.categoryPath); + } + String dim = request.categoryPath.components[0]; + + SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim); + if (ordRange == null) { + throw new IllegalArgumentException("dim \"" + dim + "\" does not exist"); + } + } + } + + @Override + public FacetsAggregator getAggregator() { + + return new FacetsAggregator() { + + @Override + public void aggregate(MatchingDocs matchingDocs, CategoryListParams clp, FacetArrays facetArrays) throws IOException { + + SortedSetDocValues 
segValues = matchingDocs.context.reader().getSortedSetDocValues(field); + if (segValues == null) { + return; + } + + final int[] counts = facetArrays.getIntArray(); + final int maxDoc = matchingDocs.context.reader().maxDoc(); + assert maxDoc == matchingDocs.bits.length(); + + if (dv instanceof MultiSortedSetDocValues) { + MultiDocValues.OrdinalMap ordinalMap = ((MultiSortedSetDocValues) dv).mapping; + // nocommit is context.ord really right...? + int segOrd = matchingDocs.context.ord; + + final MonotonicAppendingLongBuffer subMap = ordinalMap.getSub(segOrd); + + int numSegOrds = (int) segValues.getValueCount(); + + if (matchingDocs.totalHits < numSegOrds/10) { + // Remap every ord to global ord as we iterate: + final int[] segCounts = new int[numSegOrds]; + int doc = 0; + while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) { + segValues.setDocument(doc); + int term = (int) segValues.nextOrd(); + while (term != SortedSetDocValues.NO_MORE_ORDS) { + counts[term + (int) subMap.get(term)]++; + term = (int) segValues.nextOrd(); + } + ++doc; + } + } else { + + // First count in seg-ord space: + final int[] segCounts = new int[numSegOrds]; + int doc = 0; + while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) { + segValues.setDocument(doc); + int term = (int) segValues.nextOrd(); + while (term != SortedSetDocValues.NO_MORE_ORDS) { + segCounts[term]++; + term = (int) segValues.nextOrd(); + } + ++doc; + } + + // Then, migrate to global ords: + for(int ord=0;ord accumulate(List matchingDocs) throws IOException { + + FacetsAggregator aggregator = getAggregator(); + for (CategoryListParams clp : getCategoryLists()) { + for (MatchingDocs md : matchingDocs) { + aggregator.aggregate(md, clp, facetArrays); + } + } + + // compute top-K + + List results = new ArrayList(); + + int[] counts = facetArrays.getIntArray(); + + for(FacetRequest request : searchParams.facetRequests) { + String dim = request.categoryPath.components[0]; + 
SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim); + // checked in ctor: + assert ordRange != null; + + SortedSetDocValuesCollector.TopCountPQ q = new SortedSetDocValuesCollector.TopCountPQ(request.numResults); + + int bottomCount = 0; + + //System.out.println("collect"); + int dimCount = 0; + for(int ord=ordRange.start; ord<=ordRange.end; ord++) { + //System.out.println(" ord=" + ord + " count= "+ counts[ord] + " bottomCount=" + bottomCount); + if (counts[ord] > bottomCount) { + dimCount += counts[ord]; + //System.out.println(" keep"); + q.insertWithOverflow(new FacetResultNode(ord, counts[ord])); + if (q.size() == request.numResults) { + bottomCount = (int) q.top().value; + //System.out.println(" new bottom=" + bottomCount); + } + } + } + + CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim); + if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) { + dimCount = 0; + } + + FacetResultNode rootNode = new FacetResultNode(-1, dimCount); + rootNode.label = new CategoryPath(new String[] {dim}); + + FacetResultNode[] childNodes = new FacetResultNode[q.size()]; + BytesRef scratch = new BytesRef(); + for(int i=childNodes.length-1;i>=0;i--) { + childNodes[i] = q.pop(); + dv.lookupOrd(childNodes[i].ordinal, scratch); + childNodes[i].label = new CategoryPath(scratch.utf8ToString(), '/'); + } + rootNode.subResults = Arrays.asList(childNodes); + + results.add(new FacetResult(request, rootNode, childNodes.length)); + } + + return results; + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesReaderState.java =================================================================== --- 
lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesReaderState.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesReaderState.java (working copy) @@ -0,0 +1,131 @@ +package org.apache.lucene.facet.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.CompositeReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.SlowCompositeReaderWrapper; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; + +// nocommit make sep controllable + +// nocommit explain that sep must be / +// ... but explain that hierarchy doesn't quite "work" + +/** Wraps an {@link IndexReader} and resolves ords + * using existing {@link SortedSetDocValues} APIs without a + * separate taxonomy index. This makes faceting a bit + * slower but avoids managing the separate taxonomy index. 
+ * + * NOTE: creating an instance of this class is + * somewhat costly, as it computes per-segment ordinal maps, + * so you should create it once and re-use that one instance + * for a given {@link IndexReader}. + * +*/ + +public final class SortedSetDocValuesReaderState { + + private final String field; + private final AtomicReader topReader; + private final int valueCount; + + public static final class OrdRange { + public final int start; + public final int end; + + /** Start and end are inclusive. */ + public OrdRange(int start, int end) { + this.start = start; + this.end = end; + } + } + + private final Map prefixToOrdRange = new HashMap(); + + public SortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException { + this.field = field; + // We need this to create thread-safe MultiSortedSetDV + // per collector: + if (reader instanceof AtomicReader) { + topReader = (AtomicReader) reader; + } else { + topReader = new SlowCompositeReaderWrapper((CompositeReader) reader); + } + SortedSetDocValues dv = topReader.getSortedSetDocValues(field); + if (dv == null) { + throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues"); + } + if (dv.getValueCount() > Integer.MAX_VALUE) { + throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount()); + } + valueCount = (int) dv.getValueCount(); + + // TODO: wasteful that we must allocate 3 int[maxOrd]! + // Can we cutover to methods instead? We can have a + // much more RAM-efficient impl... + + // TODO: we can make this more efficient if eg we can be + // "involved" when OrdinalMap is being created? Ie see + // each term/ord it's assigning as it goes... + String lastDim = null; + int startOrd = -1; + BytesRef spare = new BytesRef(); + // TODO: this approach can work for full hierarchy?; generalize it: + for(int ord=0;ord