Index: TODO =================================================================== --- TODO (revision 0) +++ TODO (working copy) @@ -0,0 +1,25 @@ +nocommit this! + +TODO + - SSDVValueSourceFacets? + - we could put more stuff into the "schema", e.g. this field is + sorted-set-DV and that one is taxo? + - standardize on facet or facets (e.g. FacetIndexWriter) + - fewer args when constructing a range + - varargs to FacetFields.addFields + - can you just add FacetField to the doc? + - rename CategoryPath -> FacetLabel + - how to do avg() agg? + - test needsScores=true / valuesource associations + - drill sideways + - make FieldTypes optional (if all your dims are flat)? + - add hierarchy to ssdv facets? + - sparse faceting: allow skipping of certain dims? + - ords cache + - complements + - sampling + - associations + - maybe an interface/abstract class for "FacetResults"? has common + API, ie to get top facets under a path, get all dims; then DS can + use this? + - consistently name things "dimension"? calling these fields is CONFUSING Index: lucene/facet/src/java/org/apache/lucene/facet/simple/TopOrdCountQueue.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/TopOrdCountQueue.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/TopOrdCountQueue.java (working copy) @@ -0,0 +1,44 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.PriorityQueue; + +/** Keeps highest count results. */ +class TopOrdCountQueue extends PriorityQueue { + + public static final class OrdAndCount { + int ord; + int count; + } + + public TopOrdCountQueue(int topN) { + super(topN, false); + } + + @Override + protected boolean lessThan(OrdAndCount a, OrdAndCount b) { + if (a.count < b.count) { + return true; + } else if (a.count > b.count) { + return false; + } else { + return a.ord > b.ord; + } + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/TopOrdCountQueue.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/FacetField.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/FacetField.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/FacetField.java (working copy) @@ -0,0 +1,61 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; + +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; + +/** Add an instance of this to your Document for every facet + * label. */ +public class FacetField extends Field { + static final FieldType TYPE = new FieldType(); + static { + TYPE.setIndexed(true); + TYPE.freeze(); + } + final String dim; + final String[] path; + private String indexedFieldName = Constants.DEFAULT_FACET_FIELD; + + public FacetField(String dim, String... path) { + super("dummy", TYPE); + this.dim = dim; + if (path.length == 0) { + throw new IllegalArgumentException("path must have at least one element"); + } + this.path = path; + } + + /** Expert: call this if you want to change which + * underlying field will hold these facets from the + * default $facets. 
*/ + public void setIndexedFieldName(String indexedFieldName) { + this.indexedFieldName = indexedFieldName; + } + + public String getIndexedFieldName() { + return indexedFieldName; + } + + @Override + public String toString() { + return "FacetField(dim=" + dim + " path=" + Arrays.toString(path) + ")"; + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/FacetField.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleFacetsCollector.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleFacetsCollector.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleFacetsCollector.java (working copy) @@ -0,0 +1,136 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.facet.params.FacetSearchParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.FixedBitSet; + +/** + * A {@link Collector} which executes faceted search and computes the weight of + * requested facets. To get the facet results you should call + * {@link #getFacetResults()}. + * {@link #create(FacetSearchParams, IndexReader, TaxonomyReader)} returns the + * most optimized {@link FacetsCollector} for the given parameters. + * + * @lucene.experimental + */ +public final class SimpleFacetsCollector extends Collector { + + private AtomicReaderContext context; + private Scorer scorer; + private FixedBitSet bits; + private int totalHits; + private float[] scores; + private final boolean keepScores; + private final List matchingDocs = new ArrayList(); + + /** + * Holds the documents that were matched in the {@link AtomicReaderContext}. + * If scores were required, then {@code scores} is not null. 
+ */ + public final static class MatchingDocs { + + public final AtomicReaderContext context; + public final FixedBitSet bits; + public final float[] scores; + public final int totalHits; + + public MatchingDocs(AtomicReaderContext context, FixedBitSet bits, int totalHits, float[] scores) { + this.context = context; + this.bits = bits; + this.scores = scores; + this.totalHits = totalHits; + } + } + + public SimpleFacetsCollector() { + this(false); + } + + public SimpleFacetsCollector(boolean keepScores) { + this.keepScores = keepScores; + } + + public boolean getKeepScores() { + return keepScores; + } + + /** + * Returns the documents matched by the query, one {@link MatchingDocs} per + * visited segment. + */ + public List getMatchingDocs() { + if (bits != null) { + matchingDocs.add(new MatchingDocs(this.context, bits, totalHits, scores)); + bits = null; + scores = null; + context = null; + } + + return matchingDocs; + } + + @Override + public final boolean acceptsDocsOutOfOrder() { + // nocommit why not true? 
+ return false; + } + + @Override + public final void collect(int doc) throws IOException { + bits.set(doc); + if (keepScores) { + if (totalHits >= scores.length) { + float[] newScores = new float[ArrayUtil.oversize(totalHits + 1, 4)]; + System.arraycopy(scores, 0, newScores, 0, totalHits); + scores = newScores; + } + scores[totalHits] = scorer.score(); + } + totalHits++; + } + + @Override + public final void setScorer(Scorer scorer) throws IOException { + this.scorer = scorer; + } + + @Override + public final void setNextReader(AtomicReaderContext context) throws IOException { + if (bits != null) { + matchingDocs.add(new MatchingDocs(this.context, bits, totalHits, scores)); + } + bits = new FixedBitSet(context.reader().maxDoc()); + totalHits = 0; + if (keepScores) { + scores = new float[64]; // some initial size + } + this.context = context; + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleFacetsCollector.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/RangeFacetCounts.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/RangeFacetCounts.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/RangeFacetCounts.java (working copy) @@ -0,0 +1,93 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.facet.range.Range; +import org.apache.lucene.facet.simple.SimpleFacetsCollector.MatchingDocs; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.queries.function.FunctionValues; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.lucene.queries.function.valuesource.LongFieldSource; + +/** + * Uses {@link RangeFacetRequest#getValues(AtomicReaderContext)} and accumulates + * counts for provided ranges. + */ +public class RangeFacetCounts { + private final Range[] ranges; + private final int[] counts; + private int totCount; + + public RangeFacetCounts(String field, SimpleFacetsCollector hits, Range... ranges) throws IOException { + this(new LongFieldSource(field), hits, ranges); + } + + public RangeFacetCounts(ValueSource valueSource, SimpleFacetsCollector hits, Range... ranges) throws IOException { + this.ranges = ranges; + counts = new int[ranges.length]; + count(valueSource, hits.getMatchingDocs()); + } + + private void count(ValueSource valueSource, List matchingDocs) throws IOException { + + // TODO: test if this is faster (in the past it was + // faster to do MatchingDocs on the inside) ... 
see + // patches on LUCENE-4965): + for (MatchingDocs hits : matchingDocs) { + FunctionValues fv = valueSource.getValues(Collections.emptyMap(), hits.context); + final int length = hits.bits.length(); + int doc = 0; + totCount += hits.totalHits; + while (doc < length && (doc = hits.bits.nextSetBit(doc)) != -1) { + // Skip missing docs: + if (fv.exists(doc)) { + + long v = fv.longVal(doc); + + // TODO: if all ranges are non-overlapping, we + // should instead do a bin-search up front + // (really, a specialized case of the interval + // tree) + // TODO: use interval tree instead of linear search: + for (int j = 0; j < ranges.length; j++) { + if (ranges[j].accept(v)) { + counts[j]++; + } + } + } + + doc++; + } + } + } + + public SimpleFacetResult getCounts() { + LabelAndValue[] labelValues = new LabelAndValue[counts.length]; + for(int i=0;i> byField = new HashMap>(); + + // ... and also all SortedSetDocValuesFacetFields: + Map> dvByField = new HashMap>(); + + for(IndexableField field : doc.indexableFields()) { + if (field.fieldType() == FacetField.TYPE) { + FacetField facetField = (FacetField) field; + String indexedFieldName = facetField.getIndexedFieldName(); + List fields = byField.get(indexedFieldName); + if (fields == null) { + fields = new ArrayList(); + byField.put(indexedFieldName, fields); + } + fields.add(facetField); + } + + if (field.fieldType() == SortedSetDocValuesFacetField.TYPE) { + SortedSetDocValuesFacetField facetField = (SortedSetDocValuesFacetField) field; + String indexedFieldName = facetField.getIndexedFieldName(); + List fields = dvByField.get(indexedFieldName); + if (fields == null) { + fields = new ArrayList(); + dvByField.put(indexedFieldName, fields); + } + fields.add(facetField); + } + } + + List addedIndexedFields = new ArrayList(); + List addedStoredFields = new ArrayList(); + + processFacetFields(byField, addedIndexedFields, addedStoredFields); + processSSDVFacetFields(dvByField, addedIndexedFields, addedStoredFields); + + 
//System.out.println("add stored: " + addedStoredFields); + + final List allIndexedFields = new ArrayList(); + for(IndexableField field : doc.indexableFields()) { + IndexableFieldType ft = field.fieldType(); + if (ft != FacetField.TYPE && ft != SortedSetDocValuesFacetField.TYPE) { + allIndexedFields.add(field); + } + } + allIndexedFields.addAll(addedIndexedFields); + + final List allStoredFields = new ArrayList(); + for(StorableField field : doc.storableFields()) { + allStoredFields.add(field); + } + allStoredFields.addAll(addedStoredFields); + + //System.out.println("all indexed: " + allIndexedFields); + //System.out.println("all stored: " + allStoredFields); + + super.addDocument(new IndexDocument() { + @Override + public Iterable indexableFields() { + return allIndexedFields; + } + + @Override + public Iterable storableFields() { + return allStoredFields; + } + }); + } + + private void processFacetFields(Map> byField, List addedIndexedFields, List addedStoredFields) throws IOException { + + for(Map.Entry> ent : byField.entrySet()) { + + // nocommit maybe we can somehow catch singleValued + // dim appearing more than once? 
+ + String indexedFieldName = ent.getKey(); + //System.out.println(" fields=" + ent.getValue()); + + IntsRef ordinals = new IntsRef(32); + for(FacetField facetField : ent.getValue()) { + + FacetsConfig.DimConfig ft = facetsConfig.getDimConfig(facetField.dim); + if (facetField.path.length > 1 && ft.hierarchical == false) { + throw new IllegalArgumentException("dimension \"" + facetField.dim + "\" is not hierarchical yet has " + facetField.path.length + " components"); + } + + // Append dim and path: + String[] fullPath = new String[facetField.path.length+1]; + fullPath[0] = facetField.dim; + System.arraycopy(facetField.path, 0, fullPath, 1, facetField.path.length); + + CategoryPath cp = new CategoryPath(fullPath); + + int ordinal = taxoWriter.addCategory(cp); + ordinals.ints[ordinals.length++] = ordinal; + //System.out.println(" add cp=" + cp); + + if (ft.hierarchical && ft.multiValued) { + // Add all parents too: + int parent = taxoWriter.getParent(ordinal); + while (parent > 0) { + if (ordinals.ints.length == ordinals.length) { + ordinals.grow(ordinals.length+1); + } + ordinals.ints[ordinals.length++] = parent; + parent = taxoWriter.getParent(parent); + } + } + + // Drill down: + for(int i=2;i<=cp.length;i++) { + addedIndexedFields.add(new StringField(indexedFieldName, cp.subpath(i).toString(facetDelimChar), Field.Store.NO)); + } + } + + // Facet counts: + // DocValues are considered stored fields: + addedStoredFields.add(new BinaryDocValuesField(indexedFieldName, dedupAndEncode(ordinals))); + } + } + + private void processSSDVFacetFields(Map> byField, List addedIndexedFields, List addedStoredFields) throws IOException { + //System.out.println("process SSDV: " + byField); + for(Map.Entry> ent : byField.entrySet()) { + + String indexedFieldName = ent.getKey(); + //System.out.println(" field=" + indexedFieldName); + + for(SortedSetDocValuesFacetField facetField : ent.getValue()) { + CategoryPath cp = new CategoryPath(facetField.dim, facetField.label); + String 
fullPath = cp.toString(facetDelimChar); + //System.out.println("add " + fullPath); + + // For facet counts: + addedStoredFields.add(new SortedSetDocValuesField(indexedFieldName, new BytesRef(fullPath))); + + // For drill-down: + addedIndexedFields.add(new StringField(indexedFieldName, fullPath, Field.Store.NO)); + } + } + } + + /** We can open this up if/when we really need + * pluggability on the encoding. */ + private final BytesRef dedupAndEncode(IntsRef ordinals) { + Arrays.sort(ordinals.ints, ordinals.offset, ordinals.length); + byte[] bytes = new byte[5*ordinals.length]; + int lastOrd = -1; + int upto = 0; + for(int i=0;i lastOrd) { + int delta; + if (lastOrd == -1) { + delta = ord; + } else { + delta = ord - lastOrd; + } + if ((delta & ~0x7F) == 0) { + bytes[upto] = (byte) delta; + upto++; + } else if ((delta & ~0x3FFF) == 0) { + bytes[upto] = (byte) (0x80 | ((delta & 0x3F80) >> 7)); + bytes[upto + 1] = (byte) (delta & 0x7F); + upto += 2; + } else if ((delta & ~0x1FFFFF) == 0) { + bytes[upto] = (byte) (0x80 | ((delta & 0x1FC000) >> 14)); + bytes[upto + 1] = (byte) (0x80 | ((delta & 0x3F80) >> 7)); + bytes[upto + 2] = (byte) (delta & 0x7F); + upto += 3; + } else if ((delta & ~0xFFFFFFF) == 0) { + bytes[upto] = (byte) (0x80 | ((delta & 0xFE00000) >> 21)); + bytes[upto + 1] = (byte) (0x80 | ((delta & 0x1FC000) >> 14)); + bytes[upto + 2] = (byte) (0x80 | ((delta & 0x3F80) >> 7)); + bytes[upto + 3] = (byte) (delta & 0x7F); + upto += 4; + } else { + bytes[upto] = (byte) (0x80 | ((delta & 0xF0000000) >> 28)); + bytes[upto + 1] = (byte) (0x80 | ((delta & 0xFE00000) >> 21)); + bytes[upto + 2] = (byte) (0x80 | ((delta & 0x1FC000) >> 14)); + bytes[upto + 3] = (byte) (0x80 | ((delta & 0x3F80) >> 7)); + bytes[upto + 4] = (byte) (delta & 0x7F); + upto += 5; + } + lastOrd = ord; + } + } + return new BytesRef(bytes, 0, upto); + } + +} \ No newline at end of file Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/FacetIndexWriter.java 
___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/TopOrdValueQueue.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/TopOrdValueQueue.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/TopOrdValueQueue.java (working copy) @@ -0,0 +1,46 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.PriorityQueue; + +// nocommit make value a double and merge with TopOrdCountValueQueue? + +/** Keeps highest results. 
*/ +class TopOrdValueQueue extends PriorityQueue { + + public static final class OrdAndValue { + int ord; + float value; + } + + public TopOrdValueQueue(int topN) { + super(topN, false); + } + + @Override + protected boolean lessThan(OrdAndValue a, OrdAndValue b) { + if (a.value < b.value) { + return true; + } else if (a.value > b.value) { + return false; + } else { + return a.ord > b.ord; + } + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/TopOrdValueQueue.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetFields.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetFields.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetFields.java (working copy) @@ -0,0 +1,80 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Map.Entry; +import java.util.Map; +import java.util.Collections; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.facet.index.DrillDownStream; +import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.params.CategoryListParams; +import org.apache.lucene.facet.params.FacetIndexingParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.util.BytesRef; + +/** Use this to index facets if you intend to + * use {@link SortedSetDocValuesAccumulator} to count facets + * at search time. Note that this only supports flat + * facets (dimension + label). Instantiate this class + * once, and then call {@link #addFields} to add the + * necessary fields to each {@link Document}. */ + +public class SortedSetDocValuesFacetFields { + private final String dvFieldName; + private final char delimChar; + + /** Create a {@code SortedSetDocValuesFacetField} with the + * provided {@link CategoryPath}. */ + public SortedSetDocValuesFacetFields(String dvFieldName) { + this(dvFieldName, Constants.DEFAULT_DELIM_CHAR); + } + + /** Create a {@code SortedSetDocValuesFacetField} with the + * provided {@link CategoryPath}. 
*/ + public SortedSetDocValuesFacetFields(String dvFieldName, char delimChar) { + this.dvFieldName = dvFieldName; + this.delimChar = delimChar; + } + + /** Sugar */ + public void addFields(Document doc, CategoryPath category) throws IOException { + addFields(doc, Collections.singletonList(category)); + } + + public void addFields(Document doc, Iterable categories) throws IOException { + // Add sorted-set DV fields, one per value: + for(CategoryPath cp : categories) { + if (cp.length != 2) { + throw new IllegalArgumentException("only flat facets (dimension + label) are currently supported; got " + cp); + } + String fullPath = cp.toString(delimChar); + + // For facet-counts: + doc.add(new SortedSetDocValuesField(dvFieldName, new BytesRef(fullPath))); + + // For drill-down: + doc.add(new StringField(dvFieldName, fullPath, Field.Store.NO)); + } + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetFields.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/TaxonomyFacetSumValueSource.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/TaxonomyFacetSumValueSource.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/TaxonomyFacetSumValueSource.java (working copy) @@ -0,0 +1,254 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.facet.simple.SimpleFacetsCollector.MatchingDocs; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.queries.function.FunctionValues; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; + +/** Aggregates sum of values from a {@link ValueSource}, for + * each facet label. 
*/ + +public class TaxonomyFacetSumValueSource { + private final FacetsConfig facetsConfig; + private final TaxonomyReader taxoReader; + private final float[] values; + private final String facetsFieldName; + private final int[] children; + private final int[] parents; + private final int[] siblings; + + public TaxonomyFacetSumValueSource(TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc, ValueSource valueSource) throws IOException { + this(Constants.DEFAULT_FACET_FIELD, taxoReader, facetsConfig, fc, valueSource); + } + + public TaxonomyFacetSumValueSource(String facetsFieldName, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc, ValueSource valueSource) throws IOException { + this.taxoReader = taxoReader; + this.facetsFieldName = facetsFieldName; + this.facetsConfig = facetsConfig; + ParallelTaxonomyArrays pta = taxoReader.getParallelTaxonomyArrays(); + children = pta.children(); + parents = pta.parents(); + siblings = pta.siblings(); + values = new float[taxoReader.getSize()]; + sumValues(fc.getMatchingDocs(), fc.getKeepScores(), valueSource); + } + + private static final class FakeScorer extends Scorer { + float score; + int docID; + FakeScorer() { super(null); } + @Override public float score() throws IOException { return score; } + @Override public int freq() throws IOException { throw new UnsupportedOperationException(); } + @Override public int docID() { return docID; } + @Override public int nextDoc() throws IOException { throw new UnsupportedOperationException(); } + @Override public int advance(int target) throws IOException { throw new UnsupportedOperationException(); } + @Override public long cost() { return 0; } + } + + private final void sumValues(List matchingDocs, boolean keepScores, ValueSource valueSource) throws IOException { + final FakeScorer scorer = new FakeScorer(); + Map context = new HashMap(); + context.put("scorer", scorer); + for(MatchingDocs hits : matchingDocs) { + 
BinaryDocValues dv = hits.context.reader().getBinaryDocValues(facetsFieldName); + if (dv == null) { // this reader does not have DocValues for the requested category list + continue; + } + FixedBitSet bits = hits.bits; + + final int length = hits.bits.length(); + int doc = 0; + int scoresIdx = 0; + BytesRef scratch = new BytesRef(); + float[] scores = hits.scores; + + FunctionValues functionValues = valueSource.getValues(context, hits.context); + while (doc < length && (doc = bits.nextSetBit(doc)) != -1) { + dv.get(doc, scratch); + if (keepScores) { + scorer.docID = doc; + scorer.score = scores[scoresIdx++]; + } + byte[] bytes = scratch.bytes; + int end = scratch.offset + scratch.length; + int ord = 0; + int offset = scratch.offset; + int prev = 0; + + float value = (float) functionValues.doubleVal(doc); + + while (offset < end) { + byte b = bytes[offset++]; + if (b >= 0) { + prev = ord = ((ord << 7) | b) + prev; + values[ord] += value; + ord = 0; + } else { + ord = (ord << 7) | (b & 0x7F); + } + } + ++doc; + } + } + + // nocommit we could do this lazily instead: + + // Rollup any necessary dims: + for(Map.Entry ent : facetsConfig.getDimConfigs().entrySet()) { + String dim = ent.getKey(); + FacetsConfig.DimConfig ft = ent.getValue(); + if (ft.hierarchical && ft.multiValued == false) { + int dimRootOrd = taxoReader.getOrdinal(new CategoryPath(dim)); + assert dimRootOrd > 0; + values[dimRootOrd] += rollup(children[dimRootOrd]); + } + } + } + + private float rollup(int ord) { + float sum = 0; + while (ord != TaxonomyReader.INVALID_ORDINAL) { + float childValue = values[ord] + rollup(children[ord]); + values[ord] = childValue; + sum += childValue; + ord = siblings[ord]; + } + return sum; + } + + /** Return the count for a specific path. Returns -1 if + * this path doesn't exist, else the count. 
*/ + public float getSpecificValue(CategoryPath path) throws IOException { + int ord = taxoReader.getOrdinal(path); + if (ord < 0) { + return -1; + } + return values[ord]; + } + + /** Sugar, for flat fields. */ + public SimpleFacetResult getDim(String dim, int topN) throws IOException { + return getTopChildren(new CategoryPath(dim), topN); + } + + /** Returns null if this path doesn't exist or all counts + * were 0, else topN children under the specified path. */ + public SimpleFacetResult getTopChildren(CategoryPath path, int topN) throws IOException { + int ord = taxoReader.getOrdinal(path); + if (ord == -1) { + return null; + } + return getTopChildren(path, ord, topN); + } + + private SimpleFacetResult getTopChildren(CategoryPath path, int dimOrd, int topN) throws IOException { + + TopOrdValueQueue q = new TopOrdValueQueue(topN); + + float bottomValue = 0; + + int ord = children[dimOrd]; + float sumValues = 0; + + TopOrdValueQueue.OrdAndValue reuse = null; + while(ord != TaxonomyReader.INVALID_ORDINAL) { + if (values[ord] > 0) { + sumValues += values[ord]; + if (values[ord] > bottomValue) { + if (reuse == null) { + reuse = new TopOrdValueQueue.OrdAndValue(); + } + reuse.ord = ord; + reuse.value = values[ord]; + reuse = q.insertWithOverflow(reuse); + if (q.size() == topN) { + bottomValue = q.top().value; + } + } + } + + ord = siblings[ord]; + } + + if (sumValues == 0) { + return null; + } + + FacetsConfig.DimConfig ft = facetsConfig.getDimConfig(path.components[0]); + if (ft.hierarchical && ft.multiValued) { + sumValues = values[dimOrd]; + } + + LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + for(int i=labelValues.length-1;i>=0;i--) { + TopOrdValueQueue.OrdAndValue ordAndValue = q.pop(); + CategoryPath child = taxoReader.getPath(ordAndValue.ord); + labelValues[i] = new LabelAndValue(child.components[path.length], ordAndValue.value); + } + + return new SimpleFacetResult(path, sumValues, labelValues); + } + + /** Returns topN labels for any dimension 
that had hits, + * sorted by the number of hits that dimension matched. */ + public List getAllDims(int topN) throws IOException { + int ord = children[TaxonomyReader.ROOT_ORDINAL]; + List results = new ArrayList(); + while (ord != TaxonomyReader.INVALID_ORDINAL) { + SimpleFacetResult result = getTopChildren(taxoReader.getPath(ord), ord, topN); + if (result != null) { + results.add(result); + } + ord = siblings[ord]; + } + + // Sort by highest count: + Collections.sort(results, + new Comparator() { + @Override + public int compare(SimpleFacetResult a, SimpleFacetResult b) { + if (a.value.floatValue() > b.value.floatValue()) { + return -1; + } else if (b.value.floatValue() > a.value.floatValue()) { + return 1; + } else { + // Tie break by dimension + return a.path.components[0].compareTo(b.path.components[0]); + } + } + }); + + return results; + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/TaxonomyFacetSumValueSource.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetField.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetField.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetField.java (working copy) @@ -0,0 +1,58 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; + +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; + +/** Add an instance of this to your Document for every facet + * label to be indexed via SortedSetDocValues. */ +public class SortedSetDocValuesFacetField extends Field { + static final FieldType TYPE = new FieldType(); + static { + TYPE.setIndexed(true); + TYPE.freeze(); + } + final String dim; + final String label; + private String indexedFieldName = Constants.DEFAULT_FACET_FIELD; + + public SortedSetDocValuesFacetField(String dim, String label) { + super("dummy", TYPE); + this.dim = dim; + this.label = label; + } + + /** Expert: call this if you want to change which + * underlying field will hold these facets from the + * default $dvFacets. 
*/ + public void setIndexedFieldName(String indexedFieldName) { + this.indexedFieldName = indexedFieldName; + } + + public String getIndexedFieldName() { + return indexedFieldName; + } + + @Override + public String toString() { + return "SortedSetDocValuesFacetField(dim=" + dim + " label=" + label + ")"; + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetField.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/FacetsConfig.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/FacetsConfig.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/FacetsConfig.java (working copy) @@ -0,0 +1,73 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/** Records per-dimension facet configuration.  By default a
 *  dimension is flat and single valued; use the setters in
 *  this class to change that for any dims. */
public class FacetsConfig {

  // nocommit pull the delim char into there?
  // nocommit pull DimType into here (shai?)
  // nocommit pull facet field ($facets) into here, instead
  // of optionally setting it on FacetField

  // Maps dimension name -> its config; concurrent so
  // lookups never block on the synchronized setters:
  private final Map<String,DimConfig> fieldTypes = new ConcurrentHashMap<String,DimConfig>();

  /** Per-dimension configuration.
   *  @lucene.internal */
  // nocommit expose this to the user, vs the setters?
  public static final class DimConfig {
    // True if this dimension may have more than one path component:
    boolean hierarchical;
    // True if a document may have more than one value in this dimension:
    boolean multiValued;
  }

  /** Shared default (flat, single-valued) config, returned
   *  for any dimension that was never customized. */
  public final static DimConfig DEFAULT_FIELD_TYPE = new DimConfig();

  /** Returns the config for the given dimension, falling
   *  back to {@link #DEFAULT_FIELD_TYPE} if it was never
   *  customized. */
  public DimConfig getDimConfig(String name) {
    DimConfig ft = fieldTypes.get(name);
    if (ft == null) {
      ft = DEFAULT_FIELD_TYPE;
    }
    return ft;
  }

  /** Marks the given dimension as hierarchical. */
  // nocommit maybe setDimConfig instead?
  public synchronized void setHierarchical(String name) {
    getOrCreateDimConfig(name).hierarchical = true;
  }

  /** Marks the given dimension as multi-valued. */
  public synchronized void setMultiValued(String name) {
    getOrCreateDimConfig(name).multiValued = true;
  }

  /** Returns the existing config for this dimension,
   *  creating and recording a fresh mutable one if absent.
   *  Caller must hold this object's monitor. */
  private DimConfig getOrCreateDimConfig(String name) {
    DimConfig ft = fieldTypes.get(name);
    if (ft == null) {
      ft = new DimConfig();
      fieldTypes.put(name, ft);
    }
    return ft;
  }

  /** Returns all custom (non-default) configs, keyed by
   *  dimension name. */
  Map<String,DimConfig> getDimConfigs() {
    return fieldTypes;
  }
}
org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.List; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +public final class SimpleFacetResult { + /** Path whose children we counted. */ + public final CategoryPath path; + + /** Total value for this path (sum of all child counts, or + * sum of all child values), even those not included in + * the topN. */ + public Number value; + + /** Child counts. 
*/ + public final LabelAndValue[] labelValues; + + public SimpleFacetResult(CategoryPath path, Number value, LabelAndValue[] labelValues) { + this.path = path; + this.value = value; + this.labelValues = labelValues; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + if (path == null) { + sb.append("null"); + } else { + sb.append(path.toString()); + } + sb.append(" (" + value + ")\n"); + for(LabelAndValue labelValue : labelValues) { + sb.append(" " + labelValue + "\n"); + } + return sb.toString(); + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleFacetResult.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleDrillDownQuery.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleDrillDownQuery.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleDrillDownQuery.java (working copy) @@ -0,0 +1,223 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.facet.params.CategoryListParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.FilteredQuery; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; + +/** + * A {@link Query} for drill-down over {@link CategoryPath categories}. You + * should call {@link #add(CategoryPath...)} for every group of categories you + * want to drill-down over. Each category in the group is {@code OR'ed} with + * the others, and groups are {@code AND'ed}. + *

+ * NOTE: if you choose to create your own {@link Query} by calling + * {@link #term}, it is recommended to wrap it with {@link ConstantScoreQuery} + * and set the {@link ConstantScoreQuery#setBoost(float) boost} to {@code 0.0f}, + * so that it does not affect the scores of the documents. + * + * @lucene.experimental + */ +public final class SimpleDrillDownQuery extends Query { + + private static Term term(String field, char delimChar, CategoryPath path) { + return new Term(field, path.toString(delimChar)); + } + + private final BooleanQuery query; + private final Map drillDownDims = new LinkedHashMap(); + + /** Used by clone() */ + SimpleDrillDownQuery(BooleanQuery query, Map drillDownDims) { + this.query = query.clone(); + this.drillDownDims.putAll(drillDownDims); + } + + /** Used by DrillSideways */ + SimpleDrillDownQuery(Filter filter, SimpleDrillDownQuery other) { + query = new BooleanQuery(true); // disable coord + + BooleanClause[] clauses = other.query.getClauses(); + if (clauses.length == other.drillDownDims.size()) { + throw new IllegalArgumentException("cannot apply filter unless baseQuery isn't null; pass ConstantScoreQuery instead"); + } + assert clauses.length == 1+other.drillDownDims.size(): clauses.length + " vs " + (1+other.drillDownDims.size()); + drillDownDims.putAll(other.drillDownDims); + query.add(new FilteredQuery(clauses[0].getQuery(), filter), Occur.MUST); + for(int i=1;i clauses, Map drillDownDims) { + this.query = new BooleanQuery(true); + if (baseQuery != null) { + query.add(baseQuery, Occur.MUST); + } + for(Query clause : clauses) { + query.add(clause, Occur.MUST); + } + this.drillDownDims.putAll(drillDownDims); + } + + /** + * Creates a new {@link DrillDownQuery} without a base query, + * to perform a pure browsing query (equivalent to using + * {@link MatchAllDocsQuery} as base). + */ + public SimpleDrillDownQuery() { + this(null); + } + + /** + * Creates a new {@link DrillDownQuery} over the given base query. 
Can be + * {@code null}, in which case the result {@link Query} from + * {@link #rewrite(IndexReader)} will be a pure browsing query, filtering on + * the added categories only. + */ + public SimpleDrillDownQuery(Query baseQuery) { + query = new BooleanQuery(true); // disable coord + if (baseQuery != null) { + query.add(baseQuery, Occur.MUST); + } + } + + /** + * Adds one dimension of drill downs; if you pass multiple values they are + * OR'd, and then the entire dimension is AND'd against the base query. + */ + // nocommit can we remove CatPath here? + public void add(CategoryPath... paths) { + add(Constants.DEFAULT_FACET_FIELD, Constants.DEFAULT_DELIM_CHAR, paths); + } + + // nocommit can we remove CatPath here? + public void add(String field, CategoryPath... paths) { + add(field, Constants.DEFAULT_DELIM_CHAR, paths); + } + + // nocommit can we remove CatPath here? + public void add(String field, char delimChar, CategoryPath... paths) { + Query q; + if (paths[0].length == 0) { + throw new IllegalArgumentException("all CategoryPaths must have length > 0"); + } + String dim = paths[0].components[0]; + if (drillDownDims.containsKey(dim)) { + throw new IllegalArgumentException("dimension '" + dim + "' was already added"); + } + if (paths.length == 1) { + q = new TermQuery(term(field, delimChar, paths[0])); + } else { + BooleanQuery bq = new BooleanQuery(true); // disable coord + for (CategoryPath cp : paths) { + if (cp.length == 0) { + throw new IllegalArgumentException("all CategoryPaths must have length > 0"); + } + if (!cp.components[0].equals(dim)) { + throw new IllegalArgumentException("multiple (OR'd) drill-down paths must be under same dimension; got '" + + dim + "' and '" + cp.components[0] + "'"); + } + bq.add(new TermQuery(term(field, delimChar, cp)), Occur.SHOULD); + } + q = bq; + } + + add(dim, q); + } + + /** Expert: add a custom drill-down subQuery. 
Use this + * when you have a separate way to drill-down on the + * dimension than the indexed facet ordinals. */ + public void add(String dim, Query subQuery) { + + // TODO: we should use FilteredQuery? + + // So scores of the drill-down query don't have an + // effect: + final ConstantScoreQuery drillDownQuery = new ConstantScoreQuery(subQuery); + drillDownQuery.setBoost(0.0f); + + query.add(drillDownQuery, Occur.MUST); + + drillDownDims.put(dim, drillDownDims.size()); + } + + @Override + public SimpleDrillDownQuery clone() { + return new SimpleDrillDownQuery(query, drillDownDims); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + return prime * result + query.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof SimpleDrillDownQuery)) { + return false; + } + + SimpleDrillDownQuery other = (SimpleDrillDownQuery) obj; + return query.equals(other.query) && super.equals(other); + } + + @Override + public Query rewrite(IndexReader r) throws IOException { + if (query.clauses().size() == 0) { + // baseQuery given to the ctor was null + no drill-downs were added + // note that if only baseQuery was given to the ctor, but no drill-down terms + // is fine, since the rewritten query will be the original base query. 
+ throw new IllegalStateException("no base query or drill-down categories given"); + } + return query; + } + + @Override + public String toString(String field) { + return query.toString(field); + } + + BooleanQuery getBooleanQuery() { + return query; + } + + Map getDims() { + return drillDownDims; + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/SimpleDrillDownQuery.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesReaderState.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesReaderState.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesReaderState.java (working copy) @@ -0,0 +1,156 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Pattern; + +import org.apache.lucene.facet.params.CategoryListParams; +import org.apache.lucene.facet.params.FacetIndexingParams; +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.CompositeReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.SlowCompositeReaderWrapper; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; + +/** Wraps a {@link IndexReader} and resolves ords + * using existing {@link SortedSetDocValues} APIs without a + * separate taxonomy index. This only supports flat facets + * (dimension + label), and it makes faceting a bit + * slower, adds some cost at reopen time, but avoids + * managing the separate taxonomy index. It also requires + * less RAM than the taxonomy index, as it manages the flat + * (2-level) hierarchy more efficiently. In addition, the + * tie-break during faceting is now meaningful (in label + * sorted order). + * + *

NOTE: creating an instance of this class is + * somewhat costly, as it computes per-segment ordinal maps, + * so you should create it once and re-use that one instance + * for a given {@link IndexReader}. */ + +public final class SortedSetDocValuesReaderState { + + private final String field; + private final AtomicReader topReader; + private final int valueCount; + public final IndexReader origReader; + public final char separator; + final String separatorRegex; + + /** Holds start/end range of ords, which maps to one + * dimension (someday we may generalize it to map to + * hierarchies within one dimension). */ + public static final class OrdRange { + /** Start of range, inclusive: */ + public final int start; + /** End of range, inclusive: */ + public final int end; + + /** Start and end are inclusive. */ + public OrdRange(int start, int end) { + this.start = start; + this.end = end; + } + } + + private final Map prefixToOrdRange = new HashMap(); + + public SortedSetDocValuesReaderState(IndexReader reader) throws IOException { + this(reader, Constants.DEFAULT_FACET_FIELD, Constants.DEFAULT_DELIM_CHAR); + } + + public SortedSetDocValuesReaderState(IndexReader reader, String dvField) throws IOException { + this(reader, dvField, Constants.DEFAULT_DELIM_CHAR); + } + + /** Create an instance, scanning the {@link + * SortedSetDocValues} from the provided reader, with + * default {@link FacetIndexingParams}. 
*/ + public SortedSetDocValuesReaderState(IndexReader reader, String field, char delimChar) throws IOException { + + this.field = field; + this.separator = delimChar; + this.separatorRegex = Pattern.quote(Character.toString(separator)); + this.origReader = reader; + + // We need this to create thread-safe MultiSortedSetDV + // per collector: + topReader = SlowCompositeReaderWrapper.wrap(reader); + SortedSetDocValues dv = topReader.getSortedSetDocValues(field); + if (dv == null) { + throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues"); + } + if (dv.getValueCount() > Integer.MAX_VALUE) { + throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount()); + } + valueCount = (int) dv.getValueCount(); + + // TODO: we can make this more efficient if eg we can be + // "involved" when OrdinalMap is being created? Ie see + // each term/ord it's assigning as it goes... + String lastDim = null; + int startOrd = -1; + BytesRef spare = new BytesRef(); + + // TODO: this approach can work for full hierarchy?; + // TaxoReader can't do this since ords are not in + // "sorted order" ... 
but we should generalize this to + // support arbitrary hierarchy: + for(int ord=0;ord getPrefixToOrdRange() { + return prefixToOrdRange; + } + + public OrdRange getOrdRange(String dim) { + return prefixToOrdRange.get(dim); + } + + public String getField() { + return field; + } + + public int getSize() { + return valueCount; + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesReaderState.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/Constants.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/Constants.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/Constants.java (working copy) @@ -0,0 +1,28 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/** Shared default constants for the simple facets module. */
public final class Constants {

  /** Default delimiter placed between path components in an
   *  indexed facet label (the "unit separator" control
   *  character, unlikely to appear in user text). */
  public static final char DEFAULT_DELIM_CHAR = '\u001F';

  /** Default index field that holds facet data. */
  public static final String DEFAULT_FACET_FIELD = "$facets";

  private Constants() {
    // Constants holder: never instantiated.
  }
}
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.facet.simple.SimpleFacetsCollector.MatchingDocs; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; + +public class TaxonomyFacetCounts { + private final FacetsConfig facetsConfig; + private final TaxonomyReader taxoReader; + private final int[] counts; + private final String facetsFieldName; + private final int[] children; + private final int[] parents; + private final int[] siblings; + + public TaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException { + this(Constants.DEFAULT_FACET_FIELD, taxoReader, facetsConfig, fc); + } + + public TaxonomyFacetCounts(String facetsFieldName, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException { + this.taxoReader = taxoReader; + this.facetsFieldName = facetsFieldName; + this.facetsConfig = facetsConfig; + ParallelTaxonomyArrays pta = taxoReader.getParallelTaxonomyArrays(); + children = pta.children(); + parents = pta.parents(); + siblings = pta.siblings(); + counts = new int[taxoReader.getSize()]; + count(fc.getMatchingDocs()); + } + + private final void count(List matchingDocs) throws IOException { + //System.out.println("count matchingDocs=" + matchingDocs + " facetsField=" + facetsFieldName); + for(MatchingDocs hits : matchingDocs) { + BinaryDocValues dv = hits.context.reader().getBinaryDocValues(facetsFieldName); + if (dv == null) { // this reader does not have DocValues for the requested category list + continue; + } + FixedBitSet bits = hits.bits; + + final int 
length = hits.bits.length(); + int doc = 0; + BytesRef scratch = new BytesRef(); + //System.out.println("count seg=" + hits.context.reader()); + while (doc < length && (doc = bits.nextSetBit(doc)) != -1) { + //System.out.println(" doc=" + doc); + dv.get(doc, scratch); + byte[] bytes = scratch.bytes; + int end = scratch.offset + scratch.length; + int ord = 0; + int offset = scratch.offset; + int prev = 0; + while (offset < end) { + byte b = bytes[offset++]; + if (b >= 0) { + prev = ord = ((ord << 7) | b) + prev; + assert ord < counts.length: "ord=" + ord + " vs maxOrd=" + counts.length; + ++counts[ord]; + ord = 0; + } else { + ord = (ord << 7) | (b & 0x7F); + } + } + ++doc; + } + } + + // nocommit we could do this lazily instead: + + // Rollup any necessary dims: + for(Map.Entry ent : facetsConfig.getDimConfigs().entrySet()) { + String dim = ent.getKey(); + FacetsConfig.DimConfig ft = ent.getValue(); + if (ft.hierarchical && ft.multiValued == false) { + int dimRootOrd = taxoReader.getOrdinal(new CategoryPath(dim)); + // It can be -1 if this field was declared in the + // facetsConfig but never indexed: + if (dimRootOrd > 0) { + counts[dimRootOrd] += rollup(children[dimRootOrd]); + } + } + } + } + + private int rollup(int ord) { + int sum = 0; + while (ord != TaxonomyReader.INVALID_ORDINAL) { + int childValue = counts[ord] + rollup(children[ord]); + counts[ord] = childValue; + sum += childValue; + ord = siblings[ord]; + } + return sum; + } + + /** Return the count for a specific path. Returns -1 if + * this path doesn't exist, else the count. */ + public int getSpecificCount(CategoryPath path) throws IOException { + int ord = taxoReader.getOrdinal(path); + if (ord < 0) { + return -1; + } + return counts[ord]; + } + + /** Sugar, for flat fields. 
*/ + public SimpleFacetResult getDim(String dim, int topN) throws IOException { + return getTopChildren(new CategoryPath(dim), topN); + } + + /** Returns null if this path doesn't exist or all counts + * were 0, else topN children under the specified path. */ + public SimpleFacetResult getTopChildren(CategoryPath path, int topN) throws IOException { + int ord = taxoReader.getOrdinal(path); + if (ord == -1) { + //System.out.println("no ord for path=" + path); + return null; + } + return getTopChildren(path, ord, topN); + } + + private SimpleFacetResult getTopChildren(CategoryPath path, int dimOrd, int topN) throws IOException { + + TopOrdCountQueue q = new TopOrdCountQueue(topN); + + int bottomCount = 0; + + int ord = children[dimOrd]; + int totCount = 0; + + TopOrdCountQueue.OrdAndCount reuse = null; + while(ord != TaxonomyReader.INVALID_ORDINAL) { + if (counts[ord] > 0) { + totCount += counts[ord]; + if (counts[ord] > bottomCount) { + if (reuse == null) { + reuse = new TopOrdCountQueue.OrdAndCount(); + } + reuse.ord = ord; + reuse.count = counts[ord]; + reuse = q.insertWithOverflow(reuse); + if (q.size() == topN) { + bottomCount = q.top().count; + } + } + } + + ord = siblings[ord]; + } + + if (totCount == 0) { + //System.out.println("totCount=0 for path=" + path); + return null; + } + + FacetsConfig.DimConfig ft = facetsConfig.getDimConfig(path.components[0]); + if (ft.hierarchical && ft.multiValued) { + totCount = counts[dimOrd]; + } + + LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + for(int i=labelValues.length-1;i>=0;i--) { + TopOrdCountQueue.OrdAndCount ordAndCount = q.pop(); + CategoryPath child = taxoReader.getPath(ordAndCount.ord); + labelValues[i] = new LabelAndValue(child.components[path.length], ordAndCount.count); + } + + return new SimpleFacetResult(path, totCount, labelValues); + } + + /** Returns topN labels for any dimension that had hits, + * sorted by the number of hits that dimension matched. 
*/ + public List getAllDims(int topN) throws IOException { + int ord = children[TaxonomyReader.ROOT_ORDINAL]; + List results = new ArrayList(); + while (ord != TaxonomyReader.INVALID_ORDINAL) { + SimpleFacetResult result = getTopChildren(taxoReader.getPath(ord), ord, topN); + if (result != null) { + results.add(result); + } + ord = siblings[ord]; + } + + // Sort by highest count: + Collections.sort(results, + new Comparator() { + @Override + public int compare(SimpleFacetResult a, SimpleFacetResult b) { + if (a.value.intValue() > b.value.intValue()) { + return -1; + } else if (b.value.intValue() > a.value.intValue()) { + return 1; + } else { + // Tie break by dimension + return a.path.components[0].compareTo(b.path.components[0]); + } + } + }); + + return results; + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/TaxonomyFacetCounts.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetCounts.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetCounts.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetCounts.java (working copy) @@ -0,0 +1,271 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.facet.simple.SimpleFacetsCollector.MatchingDocs; +import org.apache.lucene.facet.simple.SortedSetDocValuesReaderState.OrdRange; +import org.apache.lucene.facet.simple.SortedSetDocValuesReaderState; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues; +import org.apache.lucene.index.MultiDocValues; +import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PriorityQueue; + +/** Compute facets counts from previously + * indexed {@link SortedSetDocValuesFacetFields}, + * without require a separate taxonomy index. Faceting is + * a bit slower (~25%), and there is added cost on every + * {@link IndexReader} open to create a new {@link + * SortedSetDocValuesReaderState}. Furthermore, this does + * not support hierarchical facets; only flat (dimension + + * label) facets, but it uses quite a bit less RAM to do + * so. + * + * After creating this class, invoke {@link #getDim} or + * {@link #getAllDims} to retrieve facet results. 
*/ + +public class SortedSetDocValuesFacetCounts { + + final SortedSetDocValuesReaderState state; + final SortedSetDocValues dv; + final String field; + final int[] counts; + + /** Sparse faceting: returns any dimension that had any + * hits, topCount labels per dimension. */ + public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state, SimpleFacetsCollector hits) + throws IOException { + this.state = state; + this.field = state.getField(); + counts = new int[state.getSize()]; + dv = state.getDocValues(); + //System.out.println("field=" + field); + count(hits.getMatchingDocs()); + } + + /** Get the topN facet labels for this dimension. + * Returns null if this dimension was never seen in the + * hits. */ + public SimpleFacetResult getDim(String dim, int topN) throws IOException { + OrdRange ordRange = state.getOrdRange(dim); + if (ordRange == null) { + throw new IllegalArgumentException("dimension \"" + dim + "\" was not indexed"); + } + return getDim(dim, ordRange, topN); + } + + private final SimpleFacetResult getDim(String dim, OrdRange ordRange, int topN) { + + TopOrdCountQueue q = null; + + int bottomCount = 0; + + int dimCount = 0; + + TopOrdCountQueue.OrdAndCount reuse = null; + //System.out.println("getDim : " + ordRange.start + " - " + ordRange.end); + for(int ord=ordRange.start; ord<=ordRange.end; ord++) { + //System.out.println(" ord=" + ord + " count=" + counts[ord]); + if (counts[ord] > 0) { + dimCount += counts[ord]; + if (counts[ord] > bottomCount) { + if (reuse == null) { + reuse = new TopOrdCountQueue.OrdAndCount(); + } + reuse.ord = ord; + reuse.count = counts[ord]; + if (q == null) { + // Lazy init, so we don't create this for the + // sparse case unnecessarily + q = new TopOrdCountQueue(topN); + } + reuse = q.insertWithOverflow(reuse); + if (q.size() == topN) { + bottomCount = q.top().count; + } + } + } + } + + if (q == null) { + return null; + } + + BytesRef scratch = new BytesRef(); + + LabelAndValue[] labelValues = new 
LabelAndValue[q.size()]; + for(int i=labelValues.length-1;i>=0;i--) { + TopOrdCountQueue.OrdAndCount ordAndCount = q.pop(); + dv.lookupOrd(ordAndCount.ord, scratch); + String s = scratch.utf8ToString(); + labelValues[i] = new LabelAndValue(s.substring(dim.length()+1, s.length()), ordAndCount.count); + } + + return new SimpleFacetResult(new CategoryPath(dim), dimCount, labelValues); + } + + /** Does all the "real work" of tallying up the counts. */ + private final void count(List matchingDocs) throws IOException { + + for(MatchingDocs hits : matchingDocs) { + + AtomicReader reader = hits.context.reader(); + + // LUCENE-5090: make sure the provided reader context "matches" + // the top-level reader passed to the + // SortedSetDocValuesReaderState, else cryptic + // AIOOBE can happen: + if (ReaderUtil.getTopLevelContext(hits.context).reader() != state.origReader) { + throw new IllegalStateException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader"); + } + + SortedSetDocValues segValues = reader.getSortedSetDocValues(field); + if (segValues == null) { + return; + } + + final int maxDoc = reader.maxDoc(); + assert maxDoc == hits.bits.length(); + + // nocommit, yet another option is to count all segs + // first, only in seg-ord space, and then do a + // merge-sort-PQ in the end to only "resolve to + // global" those seg ords that can compete, if we know + // we just want top K? ie, this is the same algo + // that'd be used for merging facets across shards + // (distributed faceting). 
but this has much higher + // temp ram req'ts (sum of number of ords across all + // segs) + if (dv instanceof MultiSortedSetDocValues) { + MultiDocValues.OrdinalMap ordinalMap = ((MultiSortedSetDocValues) dv).mapping; + int segOrd = hits.context.ord; + + int numSegOrds = (int) segValues.getValueCount(); + + if (hits.totalHits < numSegOrds/10) { + // Remap every ord to global ord as we iterate: + int doc = 0; + while (doc < maxDoc && (doc = hits.bits.nextSetBit(doc)) != -1) { + segValues.setDocument(doc); + int term = (int) segValues.nextOrd(); + while (term != SortedSetDocValues.NO_MORE_ORDS) { + counts[(int) ordinalMap.getGlobalOrd(segOrd, term)]++; + term = (int) segValues.nextOrd(); + } + ++doc; + } + } else { + + // First count in seg-ord space: + final int[] segCounts = new int[numSegOrds]; + int doc = 0; + while (doc < maxDoc && (doc = hits.bits.nextSetBit(doc)) != -1) { + segValues.setDocument(doc); + int term = (int) segValues.nextOrd(); + while (term != SortedSetDocValues.NO_MORE_ORDS) { + segCounts[term]++; + term = (int) segValues.nextOrd(); + } + ++doc; + } + + // Then, migrate to global ords: + for(int ord=0;ord getAllDims(int topN) throws IOException { + + List results = new ArrayList(); + for(Map.Entry ent : state.getPrefixToOrdRange().entrySet()) { + SimpleFacetResult fr = getDim(ent.getKey(), ent.getValue(), topN); + if (fr != null) { + results.add(fr); + } + } + + // Sort by highest count: + Collections.sort(results, + new Comparator() { + @Override + public int compare(SimpleFacetResult a, SimpleFacetResult b) { + if (a.value.intValue() > b.value.intValue()) { + return -1; + } else if (b.value.intValue() > a.value.intValue()) { + return 1; + } else { + // Tie break by dimension + return a.path.components[0].compareTo(b.path.components[0]); + } + } + }); + + return results; + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/simple/SortedSetDocValuesFacetCounts.java 
___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java (revision 1541880) +++ lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java (working copy) @@ -28,6 +28,7 @@ * * @lucene.experimental */ +// nocommit rename to FacetLabel? public class CategoryPath implements Comparable { /* Index: lucene/facet/src/test/org/apache/lucene/facet/simple/TestTaxonomyFacetsSumValueSource.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/simple/TestTaxonomyFacetsSumValueSource.java (revision 0) +++ lucene/facet/src/test/org/apache/lucene/facet/simple/TestTaxonomyFacetsSumValueSource.java (working copy) @@ -0,0 +1,189 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntField; +import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.facet.util.PrintTaxonomyStats; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.queries.function.valuesource.IntFieldSource; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util._TestUtil; + +public class TestTaxonomyFacetsSumValueSource extends FacetTestCase { + + public void testBasic() throws Exception { + + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + + // Writes facet ords to a separate directory from the + // main index: + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + IndexWriter writer = new 
FacetIndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())), taxoWriter, new FacetsConfig()); + + // Reused across documents, to add the necessary facet + // fields: + Document doc = new Document(); + doc.add(new IntField("num", 10, Field.Store.NO)); + doc.add(new FacetField("Author", "Bob")); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new IntField("num", 20, Field.Store.NO)); + doc.add(new FacetField("Author", "Lisa")); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new IntField("num", 30, Field.Store.NO)); + doc.add(new FacetField("Author", "Lisa")); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new IntField("num", 40, Field.Store.NO)); + doc.add(new FacetField("Author", "Susan")); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new IntField("num", 45, Field.Store.NO)); + doc.add(new FacetField("Author", "Frank")); + writer.addDocument(doc); + + // NRT open + IndexSearcher searcher = newSearcher(DirectoryReader.open(writer, true)); + writer.close(); + + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + taxoWriter.close(); + + // Aggregate the facet counts: + SimpleFacetsCollector c = new SimpleFacetsCollector(); + + // MatchAllDocsQuery is for "browsing" (counts facets + // for all non-deleted docs in the index); normally + // you'd use a "normal" query, and use MultiCollector to + // wrap collecting the "normal" hits and also facets: + searcher.search(new MatchAllDocsQuery(), c); + + TaxonomyFacetSumValueSource facets = new TaxonomyFacetSumValueSource(taxoReader, new FacetsConfig(), c, new IntFieldSource("num")); + + // Retrieve & verify results: + assertEquals("Author (145.0)\n Lisa (50.0)\n Frank (45.0)\n Susan (40.0)\n Bob (10.0)\n", facets.getDim("Author", 10).toString()); + + taxoReader.close(); + searcher.getIndexReader().close(); + dir.close(); + taxoDir.close(); + } + + // LUCENE-5333 + public void testSparseFacets() 
throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + + // Writes facet ords to a separate directory from the + // main index: + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + IndexWriter writer = new FacetIndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())), taxoWriter, new FacetsConfig()); + + Document doc = new Document(); + doc.add(new IntField("num", 10, Field.Store.NO)); + doc.add(new FacetField("a", "foo1")); + writer.addDocument(doc); + + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new IntField("num", 20, Field.Store.NO)); + doc.add(new FacetField("a", "foo2")); + doc.add(new FacetField("b", "bar1")); + writer.addDocument(doc); + + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new IntField("num", 30, Field.Store.NO)); + doc.add(new FacetField("a", "foo3")); + doc.add(new FacetField("b", "bar2")); + doc.add(new FacetField("c", "baz1")); + writer.addDocument(doc); + + // NRT open + IndexSearcher searcher = newSearcher(DirectoryReader.open(writer, true)); + writer.close(); + + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + taxoWriter.close(); + + SimpleFacetsCollector c = new SimpleFacetsCollector(); + searcher.search(new MatchAllDocsQuery(), c); + + TaxonomyFacetSumValueSource facets = new TaxonomyFacetSumValueSource(taxoReader, new FacetsConfig(), c, new IntFieldSource("num")); + + // Ask for top 10 labels for any dims that have counts: + List results = facets.getAllDims(10); + + assertEquals(3, results.size()); + assertEquals("a (60.0)\n foo3 (30.0)\n foo2 (20.0)\n foo1 (10.0)\n", results.get(0).toString()); + assertEquals("b (50.0)\n bar2 (30.0)\n bar1 (20.0)\n", results.get(1).toString()); + assertEquals("c (30.0)\n baz1 (30.0)\n", results.get(2).toString()); + + 
searcher.getIndexReader().close(); + taxoReader.close(); + taxoDir.close(); + dir.close(); + } + + // nocommit in the sparse case test that we are really + // sorting by the correct dim count +} Property changes on: lucene/facet/src/test/org/apache/lucene/facet/simple/TestTaxonomyFacetsSumValueSource.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/test/org/apache/lucene/facet/simple/TestSortedSetDocValuesFacets.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/simple/TestSortedSetDocValuesFacets.java (revision 0) +++ lucene/facet/src/test/org/apache/lucene/facet/simple/TestSortedSetDocValuesFacets.java (working copy) @@ -0,0 +1,191 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.facet.search.FacetsCollector; +import org.apache.lucene.facet.simple.SortedSetDocValuesFacetCounts; +import org.apache.lucene.facet.simple.SortedSetDocValuesReaderState; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; + +public class TestSortedSetDocValuesFacets extends FacetTestCase { + + // NOTE: TestDrillSideways.testRandom also sometimes + // randomly uses SortedSetDV + + public void testBasic() throws Exception { + assumeTrue("Test requires SortedSetDV support", defaultCodecSupportsSortedSet()); + Directory dir = newDirectory(); + + IndexWriter writer = new FacetIndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo")); + doc.add(new SortedSetDocValuesFacetField("a", "bar")); + doc.add(new SortedSetDocValuesFacetField("a", "zoo")); + doc.add(new SortedSetDocValuesFacetField("b", "baz")); + writer.addDocument(doc); + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo")); + writer.addDocument(doc); + + // NRT open + IndexSearcher searcher = newSearcher(DirectoryReader.open(writer, true)); + writer.close(); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = new 
SortedSetDocValuesReaderState(searcher.getIndexReader()); + + SimpleFacetsCollector c = new SimpleFacetsCollector(); + + searcher.search(new MatchAllDocsQuery(), c); + + SortedSetDocValuesFacetCounts facets = new SortedSetDocValuesFacetCounts(state, c); + + assertEquals("a (4)\n foo (2)\n bar (1)\n zoo (1)\n", facets.getDim("a", 10).toString()); + assertEquals("b (1)\n baz (1)\n", facets.getDim("b", 10).toString()); + + // DrillDown: + SimpleDrillDownQuery q = new SimpleDrillDownQuery(); + q.add(new CategoryPath("a", "foo")); + q.add(new CategoryPath("b", "baz")); + TopDocs hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits); + + searcher.getIndexReader().close(); + dir.close(); + } + + // LUCENE-5090 + public void testStaleState() throws Exception { + assumeTrue("Test requires SortedSetDV support", defaultCodecSupportsSortedSet()); + Directory dir = newDirectory(); + + IndexWriter writer = new FacetIndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo")); + writer.addDocument(doc); + + IndexReader r = DirectoryReader.open(writer, true); + SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(r); + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "bar")); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "baz")); + writer.addDocument(doc); + + IndexSearcher searcher = newSearcher(DirectoryReader.open(writer, true)); + + SimpleFacetsCollector c = new SimpleFacetsCollector(); + + searcher.search(new MatchAllDocsQuery(), c); + + try { + new SortedSetDocValuesFacetCounts(state, c); + fail("did not hit expected exception"); + } catch (IllegalStateException ise) { + // expected + } + + r.close(); + writer.close(); + searcher.getIndexReader().close(); + dir.close(); + } + + // LUCENE-5333 + public void testSparseFacets() throws Exception { + 
assumeTrue("Test requires SortedSetDV support", defaultCodecSupportsSortedSet()); + Directory dir = newDirectory(); + + IndexWriter writer = new FacetIndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo1")); + writer.addDocument(doc); + + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo2")); + doc.add(new SortedSetDocValuesFacetField("b", "bar1")); + writer.addDocument(doc); + + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo3")); + doc.add(new SortedSetDocValuesFacetField("b", "bar2")); + doc.add(new SortedSetDocValuesFacetField("c", "baz1")); + writer.addDocument(doc); + + // NRT open + IndexSearcher searcher = newSearcher(DirectoryReader.open(writer, true)); + writer.close(); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(searcher.getIndexReader()); + + SimpleFacetsCollector c = new SimpleFacetsCollector(); + searcher.search(new MatchAllDocsQuery(), c); + SortedSetDocValuesFacetCounts facets = new SortedSetDocValuesFacetCounts(state, c); + + // Ask for top 10 labels for any dims that have counts: + List results = facets.getAllDims(10); + + assertEquals(3, results.size()); + assertEquals("a (3)\n foo1 (1)\n foo2 (1)\n foo3 (1)\n", results.get(0).toString()); + assertEquals("b (2)\n bar1 (1)\n bar2 (1)\n", results.get(1).toString()); + assertEquals("c (1)\n baz1 (1)\n", results.get(2).toString()); + + searcher.getIndexReader().close(); + dir.close(); + } + + // nocommit test different delim char & using the default + // one in a dim + + // nocommit in the sparse case test that we are really + // sorting by the correct dim count +} Property changes on: 
lucene/facet/src/test/org/apache/lucene/facet/simple/TestSortedSetDocValuesFacets.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/test/org/apache/lucene/facet/simple/TestRangeFacets.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/simple/TestRangeFacets.java (revision 0) +++ lucene/facet/src/test/org/apache/lucene/facet/simple/TestRangeFacets.java (working copy) @@ -0,0 +1,102 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.DoubleDocValuesField; +import org.apache.lucene.document.DoubleField; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FloatDocValuesField; +import org.apache.lucene.document.FloatField; +import org.apache.lucene.document.LongField; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.facet.FacetTestUtils; +import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.params.FacetIndexingParams; +import org.apache.lucene.facet.params.FacetSearchParams; +import org.apache.lucene.facet.range.LongRange; +import org.apache.lucene.facet.search.CountFacetRequest; +import org.apache.lucene.facet.search.DrillDownQuery; +import org.apache.lucene.facet.search.DrillSideways.DrillSidewaysResult; +import org.apache.lucene.facet.search.DrillSideways; +import org.apache.lucene.facet.search.FacetRequest; +import org.apache.lucene.facet.search.FacetResult; +import org.apache.lucene.facet.search.FacetResultNode; +import org.apache.lucene.facet.search.FacetsAccumulator; +import org.apache.lucene.facet.search.FacetsCollector; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import 
org.apache.lucene.search.NumericRangeQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util._TestUtil; + +public class TestRangeFacets extends FacetTestCase { + + public void testBasicLong() throws Exception { + Directory d = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), d); + Document doc = new Document(); + NumericDocValuesField field = new NumericDocValuesField("field", 0L); + doc.add(field); + for(long l=0;l<100;l++) { + field.setLongValue(l); + w.addDocument(doc); + } + field.setLongValue(Long.MAX_VALUE); + w.addDocument(doc); + + IndexReader r = w.getReader(); + w.close(); + + SimpleFacetsCollector fc = new SimpleFacetsCollector(); + IndexSearcher s = newSearcher(r); + s.search(new MatchAllDocsQuery(), fc); + + RangeFacetCounts facets = new RangeFacetCounts("field", fc, + new LongRange("less than 10", 0L, true, 10L, false), + new LongRange("less than or equal to 10", 0L, true, 10L, true), + new LongRange("over 90", 90L, false, 100L, false), + new LongRange("90 or above", 90L, true, 100L, false), + new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, true)); + + SimpleFacetResult result = facets.getCounts(); + assertEquals("null (101)\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (1)\n", + result.toString()); + + r.close(); + d.close(); + } + + // nocommit pull over all the other tests +} Property changes on: lucene/facet/src/test/org/apache/lucene/facet/simple/TestRangeFacets.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/test/org/apache/lucene/facet/simple/TestTaxonomyFacets.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/simple/TestTaxonomyFacets.java (revision 0) +++ 
lucene/facet/src/test/org/apache/lucene/facet/simple/TestTaxonomyFacets.java (working copy) @@ -0,0 +1,371 @@ +package org.apache.lucene.facet.simple; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.facet.util.PrintTaxonomyStats; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import 
org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util._TestUtil; + +public class TestTaxonomyFacets extends FacetTestCase { + + public void testBasic() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + + // Writes facet ords to a separate directory from the + // main index: + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + FacetsConfig fts = new FacetsConfig(); + fts.setHierarchical("Publish Date"); + + IndexWriter writer = new FacetIndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())), taxoWriter, fts); + + // Reused across documents, to add the necessary facet + // fields: + Document doc = new Document(); + doc.add(new FacetField("Author", "Bob")); + doc.add(new FacetField("Publish Date", "2010", "10", "15")); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2010", "10", "20")); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2012", "1", "1")); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new FacetField("Author", "Susan")); + doc.add(new FacetField("Publish Date", "2012", "1", "7")); + writer.addDocument(doc); + + doc = new Document(); + doc.add(new FacetField("Author", "Frank")); + doc.add(new FacetField("Publish Date", "1999", "5", "5")); + writer.addDocument(doc); + + // NRT open + IndexSearcher searcher = 
newSearcher(DirectoryReader.open(writer, true)); + writer.close(); + + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + taxoWriter.close(); + + // Aggregate the facet counts: + SimpleFacetsCollector c = new SimpleFacetsCollector(); + + // MatchAllDocsQuery is for "browsing" (counts facets + // for all non-deleted docs in the index); normally + // you'd use a "normal" query, and use MultiCollector to + // wrap collecting the "normal" hits and also facets: + searcher.search(new MatchAllDocsQuery(), c); + + TaxonomyFacetCounts facets = new TaxonomyFacetCounts(taxoReader, fts, c); + + // Retrieve & verify results: + assertEquals("Publish Date (5)\n 2010 (2)\n 2012 (2)\n 1999 (1)\n", facets.getDim("Publish Date", 10).toString()); + assertEquals("Author (5)\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n", facets.getDim("Author", 10).toString()); + + // Now user drills down on Publish Date/2010: + SimpleDrillDownQuery q2 = new SimpleDrillDownQuery(new MatchAllDocsQuery()); + q2.add(new CategoryPath("Publish Date", "2010")); + c = new SimpleFacetsCollector(); + searcher.search(q2, c); + facets = new TaxonomyFacetCounts(taxoReader, fts, c); + assertEquals("Author (2)\n Bob (1)\n Lisa (1)\n", facets.getDim("Author", 10).toString()); + + assertEquals(1, facets.getSpecificCount(new CategoryPath("Author", "Lisa"))); + + // Smoke test PrintTaxonomyStats: + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + PrintTaxonomyStats.printStats(taxoReader, new PrintStream(bos, false, "UTF-8"), true); + String result = bos.toString("UTF-8"); + assertTrue(result.indexOf("/Author: 4 immediate children; 5 total categories") != -1); + assertTrue(result.indexOf("/Publish Date: 3 immediate children; 12 total categories") != -1); + // Make sure at least a few nodes of the tree came out: + assertTrue(result.indexOf(" /1999") != -1); + assertTrue(result.indexOf(" /2012") != -1); + assertTrue(result.indexOf(" /20") != -1); + + taxoReader.close(); + 
searcher.getIndexReader().close(); + dir.close(); + taxoDir.close(); + } + + // LUCENE-5333 + public void testSparseFacets() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + + // Writes facet ords to a separate directory from the + // main index: + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + IndexWriter writer = new FacetIndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())), taxoWriter, new FacetsConfig()); + + Document doc = new Document(); + doc.add(new FacetField("a", "foo1")); + writer.addDocument(doc); + + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new FacetField("a", "foo2")); + doc.add(new FacetField("b", "bar1")); + writer.addDocument(doc); + + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new FacetField("a", "foo3")); + doc.add(new FacetField("b", "bar2")); + doc.add(new FacetField("c", "baz1")); + writer.addDocument(doc); + + // NRT open + IndexSearcher searcher = newSearcher(DirectoryReader.open(writer, true)); + writer.close(); + + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + taxoWriter.close(); + + SimpleFacetsCollector c = new SimpleFacetsCollector(); + searcher.search(new MatchAllDocsQuery(), c); + + TaxonomyFacetCounts facets = new TaxonomyFacetCounts(taxoReader, new FacetsConfig(), c); + + // Ask for top 10 labels for any dims that have counts: + List results = facets.getAllDims(10); + + assertEquals(3, results.size()); + assertEquals("a (3)\n foo1 (1)\n foo2 (1)\n foo3 (1)\n", results.get(0).toString()); + assertEquals("b (2)\n bar1 (1)\n bar2 (1)\n", results.get(1).toString()); + assertEquals("c (1)\n baz1 (1)\n", results.get(2).toString()); + + searcher.getIndexReader().close(); + taxoReader.close(); + taxoDir.close(); + dir.close(); + } + + // nocommit in the sparse case 
test that we are really + // sorting by the correct dim count + + /* + public void testReallyNoNormsForDrillDown() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setSimilarity(new PerFieldSimilarityWrapper() { + final Similarity sim = new DefaultSimilarity(); + + @Override + public Similarity get(String name) { + assertEquals("field", name); + return sim; + } + }); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); + TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + FacetFields facetFields = new FacetFields(taxoWriter); + + Document doc = new Document(); + doc.add(newTextField("field", "text", Field.Store.NO)); + facetFields.addFields(doc, Collections.singletonList(new CategoryPath("a/path", '/'))); + writer.addDocument(doc); + writer.close(); + taxoWriter.close(); + dir.close(); + taxoDir.close(); + } + + public void testAllParents() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + CategoryListParams clp = new CategoryListParams("$facets") { + @Override + public OrdinalPolicy getOrdinalPolicy(String fieldName) { + return OrdinalPolicy.ALL_PARENTS; + } + }; + FacetIndexingParams fip = new FacetIndexingParams(clp); + + FacetFields facetFields = new FacetFields(taxoWriter, fip); + + Document doc = new Document(); + doc.add(newTextField("field", "text", Field.Store.NO)); + facetFields.addFields(doc, Collections.singletonList(new CategoryPath("a/path", '/'))); + writer.addDocument(doc); + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + writer.close(); + + // NRT open + TaxonomyReader taxoReader = new 
DirectoryTaxonomyReader(taxoWriter); + taxoWriter.close(); + + FacetSearchParams fsp = new FacetSearchParams(fip, + new CountFacetRequest(new CategoryPath("a", '/'), 10)); + + // Aggregate the facet counts: + FacetsCollector c = FacetsCollector.create(fsp, searcher.getIndexReader(), taxoReader); + + // MatchAllDocsQuery is for "browsing" (counts facets + // for all non-deleted docs in the index); normally + // you'd use a "normal" query, and use MultiCollector to + // wrap collecting the "normal" hits and also facets: + searcher.search(new MatchAllDocsQuery(), c); + List results = c.getFacetResults(); + assertEquals(1, results.size()); + assertEquals(1, (int) results.get(0).getFacetResultNode().value); + + // LUCENE-4913: + for(FacetResultNode childNode : results.get(0).getFacetResultNode().subResults) { + assertTrue(childNode.ordinal != 0); + } + + searcher.getIndexReader().close(); + taxoReader.close(); + dir.close(); + taxoDir.close(); + } + + public void testLabelWithDelimiter() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + FacetFields facetFields = new FacetFields(taxoWriter); + + Document doc = new Document(); + doc.add(newTextField("field", "text", Field.Store.NO)); + BytesRef br = new BytesRef(new byte[] {(byte) 0xee, (byte) 0x92, (byte) 0xaa, (byte) 0xef, (byte) 0x9d, (byte) 0x89}); + facetFields.addFields(doc, Collections.singletonList(new CategoryPath("dim/" + br.utf8ToString(), '/'))); + try { + writer.addDocument(doc); + } catch (IllegalArgumentException iae) { + // expected + } + writer.close(); + taxoWriter.close(); + dir.close(); + taxoDir.close(); + } + + // LUCENE-4583: make sure if we require > 32 KB for one + // document, we don't hit exc when using Facet42DocValuesFormat + public void testManyFacetsInOneDocument() throws 
Exception { + assumeTrue("default Codec doesn't support huge BinaryDocValues", _TestUtil.fieldSupportsHugeBinaryDocValues(CategoryListParams.DEFAULT_FIELD)); + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + FacetFields facetFields = new FacetFields(taxoWriter); + + int numLabels = _TestUtil.nextInt(random(), 40000, 100000); + + Document doc = new Document(); + doc.add(newTextField("field", "text", Field.Store.NO)); + List paths = new ArrayList(); + for (int i = 0; i < numLabels; i++) { + paths.add(new CategoryPath("dim", "" + i)); + } + facetFields.addFields(doc, paths); + writer.addDocument(doc); + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + writer.close(); + + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + taxoWriter.close(); + + FacetSearchParams fsp = new FacetSearchParams(new CountFacetRequest(new CategoryPath("dim"), Integer.MAX_VALUE)); + + // Aggregate the facet counts: + FacetsCollector c = FacetsCollector.create(fsp, searcher.getIndexReader(), taxoReader); + + // MatchAllDocsQuery is for "browsing" (counts facets + // for all non-deleted docs in the index); normally + // you'd use a "normal" query, and use MultiCollector to + // wrap collecting the "normal" hits and also facets: + searcher.search(new MatchAllDocsQuery(), c); + List results = c.getFacetResults(); + assertEquals(1, results.size()); + FacetResultNode root = results.get(0).getFacetResultNode(); + assertEquals(numLabels, root.subResults.size()); + Set allLabels = new HashSet(); + for (FacetResultNode childNode : root.subResults) { + assertEquals(2, childNode.label.length); + allLabels.add(childNode.label.components[1]); + 
assertEquals(1, (int) childNode.value); + } + assertEquals(numLabels, allLabels.size()); + + IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir); + } + */ +} Property changes on: lucene/facet/src/test/org/apache/lucene/facet/simple/TestTaxonomyFacets.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property