Index: lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java (revision 1455276) +++ lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java (working copy) @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.FacetTestUtils; import org.apache.lucene.facet.index.FacetFields; @@ -243,4 +244,71 @@ dir.close(); taxoDir.close(); } + + public void testSortedSetDocValuesAccumulator() throws Exception { + assumeTrue("Test requires SortedSetDV support", defaultCodecSupportsSortedSet()); + + for(char sep : new char[] {'/', ':', '\\'}) { + + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + Document doc = new Document(); + doc.add(new SortedSetDocValuesField("myfacets", new BytesRef("a" + sep + "foo"))); + doc.add(new SortedSetDocValuesField("myfacets", new BytesRef("a" + sep + "bar"))); + doc.add(new SortedSetDocValuesField("myfacets", new BytesRef("a" + sep + "zoo"))); + doc.add(new SortedSetDocValuesField("myfacets", new BytesRef("b" + sep + "baz"))); + writer.addDocument(doc); + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new SortedSetDocValuesField("myfacets", new BytesRef("a" + sep + "foo"))); + writer.addDocument(doc); + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + writer.close(); + + List requests = new ArrayList(); + requests.add(new CountFacetRequest(new CategoryPath("a", sep), 10)); + requests.add(new CountFacetRequest(new CategoryPath("b", sep), 10)); + + final boolean doDimCount = random().nextBoolean(); + + CategoryListParams 
clp = new CategoryListParams() { + @Override + public OrdinalPolicy getOrdinalPolicy(String dimension) { + return doDimCount ? OrdinalPolicy.NO_PARENTS : OrdinalPolicy.ALL_BUT_DIMENSION; + } + }; + + FacetSearchParams fsp = new FacetSearchParams(new FacetIndexingParams(clp), requests); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(searcher.getIndexReader(), "myfacets", sep); + + //SortedSetDocValuesCollector c = new SortedSetDocValuesCollector(state); + //SortedSetDocValuesCollectorMergeBySeg c = new SortedSetDocValuesCollectorMergeBySeg(state); + + FacetsCollector c = FacetsCollector.create(new SortedSetDocValuesAccumulator(fsp, state)); + + searcher.search(new MatchAllDocsQuery(), c); + + //List results = c.getFacetResults(requests); + List results = c.getFacetResults(); + + assertEquals(2, results.size()); + + int dimCount = doDimCount ? 4 : 0; + assertEquals("a (" + dimCount + ")\n foo (2)\n bar (1)\n zoo (1)\n", FacetTestUtils.toSimpleString(results.get(0))); + + dimCount = doDimCount ? 
1 : 0; + assertEquals("b (" + dimCount + ")\n baz (1)\n", FacetTestUtils.toSimpleString(results.get(1))); + + searcher.getIndexReader().close(); + dir.close(); + } + } } Index: lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java (revision 1455276) +++ lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java (working copy) @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.FacetTestUtils; @@ -401,6 +402,8 @@ public void testRandom() throws Exception { + boolean canUseDV = defaultCodecSupportsSortedSet(); + while (aChance == 0.0) { aChance = random().nextDouble(); } @@ -437,7 +440,8 @@ s = _TestUtil.randomRealisticUnicodeString(random()); // We cannot include this character else the label // is silently truncated: - if (s.indexOf(FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR) == -1) { + if (s.indexOf(FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR) == -1 && + (!canUseDV || s.indexOf('/') == -1)) { break; } } @@ -511,6 +515,9 @@ if (VERBOSE) { System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue])); } + if (canUseDV) { + doc.add(new SortedSetDocValuesField("dvFacets", new BytesRef("dim" + dim + "/" + dimValues[dim][dimValue]))); + } } int dimValue2 = rawDoc.dims2[dim]; if (dimValue2 != -1) { @@ -519,11 +526,15 @@ if (VERBOSE) { System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2])); } + if (canUseDV) { + doc.add(new SortedSetDocValuesField("dvFacets", new BytesRef("dim" + dim + "/" + dimValues[dim][dimValue2]))); + } } } if (!paths.isEmpty()) { 
facetFields.addFields(doc, paths); } + w.addDocument(doc); } @@ -555,6 +566,14 @@ } IndexReader r = w.getReader(); w.close(); + + final SortedSetDocValuesReaderState sortedSetDVState; + if (canUseDV) { + sortedSetDVState = new SortedSetDocValuesReaderState(r, "dvFacets"); + } else { + sortedSetDVState = null; + } + if (VERBOSE) { System.out.println("r.numDocs() = " + r.numDocs()); } @@ -689,8 +708,28 @@ SimpleFacetResult expected = slowDrillSidewaysSearch(s, docs, contentToken, drillDowns, dimValues, filter); Sort sort = new Sort(new SortField("id", SortField.Type.STRING)); - DrillSidewaysResult actual = new DrillSideways(s, tr).search(ddq, filter, null, numDocs, sort, true, true, fsp); + DrillSideways ds; + if (canUseDV && random().nextBoolean()) { + if (VERBOSE) { + System.out.println(" use SortedSetDV"); + } + ds = new DrillSideways(s, null) { + @Override + protected FacetsAccumulator getDrillDownAccumulator(FacetSearchParams fsp) throws IOException { + return new SortedSetDocValuesAccumulator(fsp, sortedSetDVState); + } + @Override + protected FacetsAccumulator getDrillSidewaysAccumulator(String dim, FacetSearchParams fsp) throws IOException { + return new SortedSetDocValuesAccumulator(fsp, sortedSetDVState); + } + }; + } else { + ds = new DrillSideways(s, tr); + } + + DrillSidewaysResult actual = ds.search(ddq, filter, null, numDocs, sort, true, true, fsp); + TopDocs hits = s.search(baseQuery, numDocs); Map scores = new HashMap(); for(ScoreDoc sd : hits.scoreDocs) { Index: lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java (working copy) @@ -0,0 +1,242 @@ +package org.apache.lucene.facet.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one 
or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.facet.params.CategoryListParams; +import org.apache.lucene.facet.params.FacetSearchParams; +import org.apache.lucene.facet.search.FacetsCollector.MatchingDocs; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues; +import org.apache.lucene.index.MultiDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PriorityQueue; + +/** A {@link FacetsAccumulator} that uses previously + * indexed {@link SortedSetDocValues} to perform faceting, + * without requiring a separate taxonomy index. Faceting is + * a bit slower (~25%), and there is added cost on every + * {@link IndexReader} open to create a new {@link + * SortedSetDocValuesReaderState}. Furthermore, this does + * not support hierarchical facets; only flat (dimension + + * label) facets, but it uses quite a bit less RAM to do so. 
*/ +public class SortedSetDocValuesAccumulator extends FacetsAccumulator { + + final SortedSetDocValuesReaderState state; + final SortedSetDocValues dv; + final String field; + + public SortedSetDocValuesAccumulator(FacetSearchParams fsp, SortedSetDocValuesReaderState state) throws IOException { + super(fsp, null, null, new FacetArrays((int) state.getDocValues().getValueCount())); + this.state = state; + this.field = state.getField(); + dv = state.getDocValues(); + + // Check params: + for(FacetRequest request : fsp.facetRequests) { + if (!(request instanceof CountFacetRequest)) { + throw new IllegalArgumentException("this collector only supports CountFacetRequest; got " + request); + } + if (request.categoryPath.length != 1) { + throw new IllegalArgumentException("this collector only supports depth 1 CategoryPath; got " + request.categoryPath); + } + String dim = request.categoryPath.components[0]; + + SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim); + if (ordRange == null) { + throw new IllegalArgumentException("dim \"" + dim + "\" does not exist"); + } + } + } + + @Override + public FacetsAggregator getAggregator() { + + return new FacetsAggregator() { + + @Override + public void aggregate(MatchingDocs matchingDocs, CategoryListParams clp, FacetArrays facetArrays) throws IOException { + + SortedSetDocValues segValues = matchingDocs.context.reader().getSortedSetDocValues(field); + if (segValues == null) { + return; + } + + final int[] counts = facetArrays.getIntArray(); + final int maxDoc = matchingDocs.context.reader().maxDoc(); + assert maxDoc == matchingDocs.bits.length(); + + if (dv instanceof MultiSortedSetDocValues) { + MultiDocValues.OrdinalMap ordinalMap = ((MultiSortedSetDocValues) dv).mapping; + int segOrd = matchingDocs.context.ord; + + int numSegOrds = (int) segValues.getValueCount(); + + if (matchingDocs.totalHits < numSegOrds/10) { + // Remap every ord to global ord as we iterate: + final int[] segCounts = new 
int[numSegOrds]; + int doc = 0; + while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) { + segValues.setDocument(doc); + int term = (int) segValues.nextOrd(); + while (term != SortedSetDocValues.NO_MORE_ORDS) { + counts[(int) ordinalMap.getGlobalOrd(segOrd, term)]++; + term = (int) segValues.nextOrd(); + } + ++doc; + } + } else { + + // First count in seg-ord space: + final int[] segCounts = new int[numSegOrds]; + int doc = 0; + while (doc < maxDoc && (doc = matchingDocs.bits.nextSetBit(doc)) != -1) { + segValues.setDocument(doc); + int term = (int) segValues.nextOrd(); + while (term != SortedSetDocValues.NO_MORE_ORDS) { + segCounts[term]++; + term = (int) segValues.nextOrd(); + } + ++doc; + } + + // Then, migrate to global ords: + for(int ord=0;ord { + public TopCountPQ(int topN) { + super(topN, false); + } + + @Override + protected boolean lessThan(FacetResultNode a, FacetResultNode b) { + if (a.value < b.value) { + return true; + } else if (a.value > b.value) { + return false; + } else { + return a.ordinal > b.ordinal; + } + } + } + + @Override + public List accumulate(List matchingDocs) throws IOException { + + FacetsAggregator aggregator = getAggregator(); + for (CategoryListParams clp : getCategoryLists()) { + for (MatchingDocs md : matchingDocs) { + aggregator.aggregate(md, clp, facetArrays); + } + } + + // compute top-K + + List results = new ArrayList(); + + int[] counts = facetArrays.getIntArray(); + + for(FacetRequest request : searchParams.facetRequests) { + String dim = request.categoryPath.components[0]; + SortedSetDocValuesReaderState.OrdRange ordRange = state.getOrdRange(dim); + // checked in ctor: + assert ordRange != null; + + TopCountPQ q = new TopCountPQ(request.numResults); + + int bottomCount = 0; + + //System.out.println("collect"); + int dimCount = 0; + for(int ord=ordRange.start; ord<=ordRange.end; ord++) { + //System.out.println(" ord=" + ord + " count= "+ counts[ord] + " bottomCount=" + bottomCount); + if (counts[ord] > 
bottomCount) { + dimCount += counts[ord]; + //System.out.println(" keep"); + q.insertWithOverflow(new FacetResultNode(ord, counts[ord])); + if (q.size() == request.numResults) { + bottomCount = (int) q.top().value; + //System.out.println(" new bottom=" + bottomCount); + } + } + } + + CategoryListParams.OrdinalPolicy op = searchParams.indexingParams.getCategoryListParams(request.categoryPath).getOrdinalPolicy(dim); + if (op == CategoryListParams.OrdinalPolicy.ALL_BUT_DIMENSION) { + dimCount = 0; + } + + FacetResultNode rootNode = new FacetResultNode(-1, dimCount); + rootNode.label = new CategoryPath(new String[] {dim}); + + FacetResultNode[] childNodes = new FacetResultNode[q.size()]; + BytesRef scratch = new BytesRef(); + for(int i=childNodes.length-1;i>=0;i--) { + childNodes[i] = q.pop(); + dv.lookupOrd(childNodes[i].ordinal, scratch); + childNodes[i].label = new CategoryPath(scratch.utf8ToString(), state.separator); + } + rootNode.subResults = Arrays.asList(childNodes); + + results.add(new FacetResult(request, rootNode, childNodes.length)); + } + + return results; + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesAccumulator.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesReaderState.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesReaderState.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/search/SortedSetDocValuesReaderState.java (working copy) @@ -0,0 +1,148 @@ +package org.apache.lucene.facet.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Pattern; + +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.CompositeReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.SlowCompositeReaderWrapper; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; + +/** Wraps a {@link IndexReader} and resolves ords + * using existing {@link SortedSetDocValues} APIs without a + * separate taxonomy index. This makes faceting a bit + * slower, and adds some cost at reopen time, but avoids + * managing the separate taxonomy index. In addition, the + * tie-break during faceting is now meaningful (in label + * sorted order). + * + *

NOTE: creating an instance of this class is + * somewhat costly, as it computes per-segment ordinal maps, + * so you should create it once and re-use that one instance + * for a given {@link IndexReader}. */ + +public final class SortedSetDocValuesReaderState { + + private final String field; + private final AtomicReader topReader; + private final int valueCount; + final char separator; + + /** Holds start/end range of ords, which maps to one + * dimension (someday we may generalize it to map to + * hierarchies within one dimension). */ + static final class OrdRange { + public final int start; + public final int end; + + /** Start and end are inclusive. */ + public OrdRange(int start, int end) { + this.start = start; + this.end = end; + } + } + + private final Map prefixToOrdRange = new HashMap(); + + /** Create an instance, scanning the {@link + * SortedSetDocValues} from the provided reader and + * field, with the default "/" separator. */ + public SortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException { + this(reader, field, '/'); + } + + /** Create an instance, scanning the {@link + * SortedSetDocValues} from the provided reader and + * field. separator should be whatever character you + * used during indexing to + * join each dimension with its labels. 
*/ + public SortedSetDocValuesReaderState(IndexReader reader, String field, char separator) throws IOException { + this.field = field; + + this.separator = separator; + + String separatorRegex = Pattern.quote(Character.toString(separator)); + + // We need this to create thread-safe MultiSortedSetDV + // per collector: + if (reader instanceof AtomicReader) { + topReader = (AtomicReader) reader; + } else { + topReader = new SlowCompositeReaderWrapper((CompositeReader) reader); + } + SortedSetDocValues dv = topReader.getSortedSetDocValues(field); + if (dv == null) { + throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues"); + } + if (dv.getValueCount() > Integer.MAX_VALUE) { + throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount()); + } + valueCount = (int) dv.getValueCount(); + + // TODO: we can make this more efficient if eg we can be + // "involved" when OrdinalMap is being created? Ie see + // each term/ord it's assigning as it goes... + String lastDim = null; + int startOrd = -1; + BytesRef spare = new BytesRef(); + + // TODO: this approach can work for full hierarchy?; + // TaxoReader can't do this since ords are not in + // "sorted order" ... but we should generalize this to + // support arbitrary hierarchy: + for(int ord=0;ord