diff --git lucene/CHANGES.txt lucene/CHANGES.txt index 1894ee5..357d9dc 100644 --- lucene/CHANGES.txt +++ lucene/CHANGES.txt @@ -161,6 +161,10 @@ New Features * LUCENE-4975: Added a new Replicator module which can replicate index revisions between server and client. (Shai Erera, Mike McCandless) +* LUCENE-5022: Added FacetResult.mergeHierarchies to merge multiple + FacetResult of the same dimension into a single one with the reconstructed + hierarchy. (Shai Erera) + Build * LUCENE-4987: Upgrade randomized testing to version 2.0.10: diff --git lucene/facet/src/java/org/apache/lucene/facet/search/FacetResult.java lucene/facet/src/java/org/apache/lucene/facet/search/FacetResult.java index 9a010fa..ae7c1d1 100644 --- lucene/facet/src/java/org/apache/lucene/facet/search/FacetResult.java +++ lucene/facet/src/java/org/apache/lucene/facet/search/FacetResult.java @@ -1,5 +1,16 @@ package org.apache.lucene.facet.search; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.util.CollectionUtil; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -24,6 +35,140 @@ package org.apache.lucene.facet.search; */ public class FacetResult { + private static FacetResultNode addIfNotExist(Map nodes, FacetResultNode node) { + FacetResultNode n = nodes.get(node.label); + if (n == null) { + nodes.put(node.label, node); + n = node; + } + return n; + } + + /** + * A utility for merging multiple {@link FacetResult} of the same + * (hierarchical) dimension into a single {@link FacetResult}, to reconstruct + * the hierarchy. The results are merged according to the following rules: + * + *

+ * NOTE: the returned results are not guaranteed to be in the same + * order of the input ones. + * + * @param results + * the results to merge + * @param taxoReader + * the {@link TaxonomyReader} to use when creating missing ancestor + * nodes + * @param dimArrays + * a mapping from a dimension to the respective {@link FacetArrays} + * from which to pull the nodes values + */ + public static List mergeHierarchies(List results, TaxonomyReader taxoReader, + Map dimArrays) throws IOException { + final Map> dims = new HashMap<>(); + for (FacetResult fr : results) { + String dim = fr.getFacetRequest().categoryPath.components[0]; + List frs = dims.get(dim); + if (frs == null) { + frs = new ArrayList<>(); + dims.put(dim, frs); + } + frs.add(fr); + } + + final List res = new ArrayList<>(); + for (List frs : dims.values()) { + FacetResult mergedResult = frs.get(0); + if (frs.size() > 1) { + CollectionUtil.introSort(frs, new Comparator() { + @Override + public int compare(FacetResult fr1, FacetResult fr2) { + return fr1.getFacetRequest().categoryPath.compareTo(fr2.getFacetRequest().categoryPath); + } + }); + Map mergedNodes = new HashMap<>(); + FacetArrays arrays = dimArrays != null ? dimArrays.get(frs.get(0).getFacetRequest().categoryPath.components[0]) : null; + for (FacetResult fr : frs) { + FacetResultNode frn = fr.getFacetResultNode(); + FacetResultNode merged = mergedNodes.get(frn.label); + if (merged == null) { + CategoryPath parent = frn.label.subpath(frn.label.length - 1); + FacetResultNode childNode = frn; + FacetResultNode parentNode = null; + while (parent.length > 0 && (parentNode = mergedNodes.get(parent)) == null) { + int parentOrd = taxoReader.getOrdinal(parent); + double parentValue = arrays != null ? fr.getFacetRequest().getValueOf(arrays, parentOrd) : -1; + parentNode = new FacetResultNode(parentOrd, parentValue); + parentNode.label = parent; + parentNode.subResults = new ArrayList<>(); + parentNode.subResults.add(childNode); + mergedNodes.put(parent, parentNode); + childNode = parentNode; + parent = parent.subpath(parent.length - 1); + } + + // at least one parent was added, so link the final (existing) + // parent with the child + if (parent.length > 0) { + if (!(parentNode.subResults instanceof ArrayList)) { + parentNode.subResults = new ArrayList<>(parentNode.subResults); + } + parentNode.subResults.add(childNode); + } + + // for missing FRNs, add new ones with label and value=-1 + // first time encountered this label, add it and all its children to + // the map. + mergedNodes.put(frn.label, frn); + for (FacetResultNode child : frn.subResults) { + addIfNotExist(mergedNodes, child); + } + } else { + if (!(merged.subResults instanceof ArrayList)) { + merged.subResults = new ArrayList<>(merged.subResults); + } + for (FacetResultNode sub : frn.subResults) { + // make sure sub wasn't already added + sub = addIfNotExist(mergedNodes, sub); + if (!merged.subResults.contains(sub)) { + merged.subResults.add(sub); + } + } + } + } + + // find the 'first' node to put on the FacetResult root + CategoryPath min = null; + for (CategoryPath cp : mergedNodes.keySet()) { + if (min == null || cp.compareTo(min) < 0) { + min = cp; + } + } + FacetRequest dummy = new FacetRequest(min, frs.get(0).getFacetRequest().numResults) { + @Override + public double getValueOf(FacetArrays arrays, int idx) { + throw new UnsupportedOperationException("not supported by this request"); + } + + @Override + public FacetArraysSource getFacetArraysSource() { + throw new UnsupportedOperationException("not supported by this request"); + } + }; + mergedResult = new FacetResult(dummy, mergedNodes.get(min), -1); + } + res.add(mergedResult); + } + return res; + } + private final FacetRequest facetRequest; private final FacetResultNode rootNode; private final int numValidDescendants; diff --git lucene/facet/src/test/org/apache/lucene/facet/search/FacetResultTest.java lucene/facet/src/test/org/apache/lucene/facet/search/FacetResultTest.java new file mode 100755 index 0000000..01084d5 --- /dev/null +++ lucene/facet/src/test/org/apache/lucene/facet/search/FacetResultTest.java @@ -0,0 +1,204 @@ +package org.apache.lucene.facet.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.facet.FacetTestUtils; +import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.params.FacetIndexingParams; +import org.apache.lucene.facet.params.FacetSearchParams; +import org.apache.lucene.facet.search.DrillSideways.DrillSidewaysResult; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.CollectionUtil; +import org.apache.lucene.util.IOUtils; +import org.junit.Test; + +public class FacetResultTest extends FacetTestCase { + + private Document newDocument(FacetFields facetFields, String... categories) throws IOException { + Document doc = new Document(); + List cats = new ArrayList(); + for (String cat : categories) { + cats.add(new CategoryPath(cat, '/')); + } + facetFields.addFields(doc, cats); + return doc; + } + + private void initIndex(Directory indexDir, Directory taxoDir) throws IOException { + IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + IndexWriter indexWriter = new IndexWriter(indexDir, conf); + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); + FacetFields facetFields = new FacetFields(taxoWriter); + indexWriter.addDocument(newDocument(facetFields, "Date/2010/March/12", "A/1")); + indexWriter.addDocument(newDocument(facetFields, "Date/2010/March/23", "A/2")); + indexWriter.addDocument(newDocument(facetFields, "Date/2010/April/17", "A/3")); + indexWriter.addDocument(newDocument(facetFields, "Date/2010/May/18", "A/1")); + indexWriter.addDocument(newDocument(facetFields, "Date/2011/January/1", "A/3")); + indexWriter.addDocument(newDocument(facetFields, "Date/2011/February/12", "A/1")); + indexWriter.addDocument(newDocument(facetFields, "Date/2011/February/18", "A/4")); + indexWriter.addDocument(newDocument(facetFields, "Date/2012/August/15", "A/1")); + indexWriter.addDocument(newDocument(facetFields, "Date/2012/July/5", "A/2")); + indexWriter.addDocument(newDocument(facetFields, "Date/2013/September/13", "A/1")); + indexWriter.addDocument(newDocument(facetFields, "Date/2013/September/25", "A/4")); + IOUtils.close(indexWriter, taxoWriter); + } + + private void searchIndex(TaxonomyReader taxoReader, IndexSearcher searcher, boolean fillMissingCounts, String[] exp, + String[][] drillDowns, int[] numResults) throws IOException { + CategoryPath[][] cps = new CategoryPath[drillDowns.length][]; + for (int i = 0; i < cps.length; i++) { + cps[i] = new CategoryPath[drillDowns[i].length]; + for (int j = 0; j < cps[i].length; j++) { + cps[i][j] = new CategoryPath(drillDowns[i][j], '/'); + } + } + DrillDownQuery ddq = new DrillDownQuery(FacetIndexingParams.DEFAULT, new MatchAllDocsQuery()); + for (CategoryPath[] cats : cps) { + ddq.add(cats); + } + + List facetRequests = new ArrayList(); + for (CategoryPath[] cats : cps) { + for (int i = 0; i < cats.length; i++) { + CategoryPath cp = cats[i]; + int numres = numResults == null ? 2 : numResults[i]; + // for each drill-down, add itself as well as its parent as requests, so + // we get the drill-sideways + facetRequests.add(new CountFacetRequest(cp, numres)); + CountFacetRequest parent = new CountFacetRequest(cp.subpath(cp.length - 1), numres); + if (!facetRequests.contains(parent) && parent.categoryPath.length > 0) { + facetRequests.add(parent); + } + } + } + + FacetSearchParams fsp = new FacetSearchParams(facetRequests); + final DrillSideways ds; + final Map dimArrays; + if (fillMissingCounts) { + dimArrays = new HashMap(); + ds = new DrillSideways(searcher, taxoReader) { + @Override + protected FacetsAccumulator getDrillSidewaysAccumulator(String dim, FacetSearchParams fsp) throws IOException { + FacetsAccumulator fa = super.getDrillSidewaysAccumulator(dim, fsp); + dimArrays.put(dim, fa.facetArrays); + return fa; + } + }; + } else { + ds = new DrillSideways(searcher, taxoReader); + dimArrays = null; + } + + final DrillSidewaysResult sidewaysRes = ds.search(null, ddq, 5, fsp); + List facetResults = FacetResult.mergeHierarchies(sidewaysRes.facetResults, taxoReader, dimArrays); + CollectionUtil.introSort(facetResults, new Comparator() { + @Override + public int compare(FacetResult o1, FacetResult o2) { + return o1.getFacetRequest().categoryPath.compareTo(o2.getFacetRequest().categoryPath); + } + }); + assertEquals(exp.length, facetResults.size()); // A + single one for date + for (int i = 0; i < facetResults.size(); i++) { + assertEquals(exp[i], FacetTestUtils.toSimpleString(facetResults.get(i))); + } + } + + @Test + public void testMergeHierarchies() throws Exception { + Directory indexDir = new RAMDirectory(), taxoDir = new RAMDirectory(); + initIndex(indexDir, taxoDir); + + DirectoryReader indexReader = DirectoryReader.open(indexDir); + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + + String[] exp = new String[] { "Date (0)\n 2010 (4)\n 2011 (3)\n" }; + searchIndex(taxoReader, searcher, false, exp, new String[][] { new String[] { "Date" } }, null); + + // two dimensions + exp = new String[] { "A (0)\n 1 (5)\n 4 (2)\n", "Date (0)\n 2010 (4)\n 2011 (3)\n" }; + searchIndex(taxoReader, searcher, false, exp, new String[][] { new String[] { "Date" }, new String[] { "A" } }, null); + + // both parent and child are OR'd + exp = new String[] { "Date (-1)\n 2010 (4)\n March (2)\n 23 (1)\n 12 (1)\n May (1)\n" }; + searchIndex(taxoReader, searcher, false, exp, new String[][] { new String[] { "Date/2010/March", "Date/2010/March/23" }}, null); + + // both parent and child are OR'd (fill counts) + exp = new String[] { "Date (0)\n 2010 (4)\n March (2)\n 23 (1)\n 12 (1)\n May (1)\n" }; + searchIndex(taxoReader, searcher, true, exp, new String[][] { new String[] { "Date/2010/March", "Date/2010/March/23" }}, null); + + // same DD twice + exp = new String[] { "Date (0)\n 2010 (4)\n March (2)\n May (1)\n 2011 (3)\n" }; + searchIndex(taxoReader, searcher, false, exp, new String[][] { new String[] { "Date/2010", "Date/2010" }}, null); + + exp = new String[] { "Date (0)\n 2010 (4)\n March (2)\n May (1)\n 2011 (3)\n" }; + searchIndex(taxoReader, searcher, false, exp, new String[][] { new String[] { "Date/2010" }}, null); + + exp = new String[] { "Date (0)\n 2010 (4)\n March (2)\n May (1)\n 2011 (3)\n February (2)\n January (1)\n" }; + searchIndex(taxoReader, searcher, false, exp, new String[][] { new String[] { "Date/2010", "Date/2011" }}, null); + + exp = new String[] { "Date (0)\n 2010 (4)\n March (2)\n 23 (1)\n 12 (1)\n May (1)\n 2011 (3)\n February (2)\n January (1)\n" }; + searchIndex(taxoReader, searcher, false, exp, new String[][] { new String[] { "Date/2010/March", "Date/2011" }}, null); + + // Date/2010/April not in top-2 of Date/2010 + exp = new String[] { "Date (0)\n 2010 (4)\n March (2)\n 23 (1)\n 12 (1)\n May (1)\n April (1)\n 17 (1)\n 2011 (3)\n February (2)\n January (1)\n" }; + searchIndex(taxoReader, searcher, false, exp, new String[][] { new String[] { "Date/2010/March", "Date/2010/April", "Date/2011" }}, null); + + // missing ancestors + exp = new String[] { "Date (-1)\n 2010 (4)\n March (2)\n May (1)\n April (1)\n 17 (1)\n 2011 (-1)\n January (1)\n 1 (1)\n" }; + searchIndex(taxoReader, searcher, false, exp, new String[][] { new String[] { "Date/2011/January/1", "Date/2010/April" }}, null); + + // missing ancestors (fill counts) + exp = new String[] { "Date (0)\n 2010 (4)\n March (2)\n May (1)\n April (1)\n 17 (1)\n 2011 (3)\n January (1)\n 1 (1)\n" }; + searchIndex(taxoReader, searcher, true, exp, new String[][] { new String[] { "Date/2011/January/1", "Date/2010/April" }}, null); + + // non-hierarchical dimension with both parent and child + exp = new String[] { "A (0)\n 1 (5)\n 4 (2)\n 3 (2)\n" }; + searchIndex(taxoReader, searcher, INFOSTREAM, exp, new String[][] { new String[] { "A", "A/3" }}, null); + + // non-hierarchical dimension with same request but different numResults + exp = new String[] { "A (0)\n 1 (5)\n 4 (2)\n 3 (2)\n 2 (2)\n" }; + searchIndex(taxoReader, searcher, INFOSTREAM, exp, new String[][] { new String[] { "A", "A" }}, new int[] { 2, 4 }); + + IOUtils.close(indexReader, taxoReader); + + IOUtils.close(indexDir, taxoDir); + } + +}