Index: lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java
===================================================================
--- lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java	(revision 0)
+++ lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java	(working copy)
@@ -0,0 +1,713 @@
+package org.apache.lucene.facet.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.facet.FacetTestCase;
+import org.apache.lucene.facet.FacetTestUtils;
+import org.apache.lucene.facet.index.FacetFields;
+import org.apache.lucene.facet.params.FacetSearchParams;
+import org.apache.lucene.facet.search.CountFacetRequest;
+import org.apache.lucene.facet.search.DrillSideways.DrillSidewaysResult;
+import org.apache.lucene.facet.search.FacetResult;
+import org.apache.lucene.facet.search.FacetResultNode;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
+import org.apache.lucene.facet.util.PrintTaxonomyStats;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.InfoStream;
+import org.apache.lucene.util._TestUtil;
+
+public class TestDrillSideways extends FacetTestCase {
+
+  private DirectoryTaxonomyWriter taxoWriter;
+  private RandomIndexWriter writer;
+  private FacetFields facetFields;
+
+  private void add(String ...
categoryPaths) throws IOException { + Document doc = new Document(); + List paths = new ArrayList(); + for(String categoryPath : categoryPaths) { + paths.add(new CategoryPath(categoryPath, '/')); + } + facetFields.addFields(doc, paths); + writer.addDocument(doc); + } + + // nocommit need random test w/ more than 2048 docs!! and + // mix in or and single value drilldown, as first dim or not + + public void testBasic() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + writer = new RandomIndexWriter(random(), dir); + + // Writes facet ords to a separate directory from the + // main index: + taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + // Reused across documents, to add the necessary facet + // fields: + facetFields = new FacetFields(taxoWriter); + + add("Author/Bob", "Publish Date/2010/10/15"); + add("Author/Lisa", "Publish Date/2010/10/20"); + add("Author/Lisa", "Publish Date/2012/1/1"); + add("Author/Susan", "Publish Date/2012/1/7"); + add("Author/Frank", "Publish Date/1999/5/5"); + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + writer.close(); + + //System.out.println("searcher=" + searcher); + + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + taxoWriter.close(); + + // Count both "Publish Date" and "Author" dimensions, in + // drill-down: + FacetSearchParams fsp = new FacetSearchParams( + new CountFacetRequest(new CategoryPath("Publish Date"), 10), + new CountFacetRequest(new CategoryPath("Author"), 10)); + + // Simple case: drill-down on a single field; in this + // case the drill-sideways + drill-down counts == + // drill-down of just the query: + DrillDownQuery ddq = new DrillDownQuery(fsp.indexingParams, new MatchAllDocsQuery()); + ddq.add(new CategoryPath("Author", "Lisa")); + DrillSidewaysResult r = DrillSideways.search(searcher, taxoReader, ddq, 10, fsp); + + assertEquals(2, r.hits.totalHits); + assertEquals(2, r.facetResults.size()); + // Publish Date is only drill-down, and Lisa published + // one in 2012 and one in 2010: + assertEquals("Publish Date: 2012=1 2010=1", toString(r.facetResults.get(0))); + // Author is drill-sideways + drill-down: Lisa + // (drill-down) published twice, and Frank/Susan/Bob + // published once: + assertEquals("Author: Lisa=2 Frank=1 Susan=1 Bob=1", toString(r.facetResults.get(1))); + + // Another simple case: drill-down on on single fields + // but OR of two values + System.out.println("\nTEST: OR"); + ddq = new DrillDownQuery(fsp.indexingParams, new MatchAllDocsQuery()); + ddq.add(new CategoryPath("Author", "Lisa"), new CategoryPath("Author", "Bob")); + r = DrillSideways.search(searcher, taxoReader, ddq, 10, fsp); + assertEquals(3, r.hits.totalHits); + assertEquals(2, r.facetResults.size()); + // Publish Date is only drill-down: Lisa and Bob + // (drill-down) published twice in 2010 and once in 2012: + assertEquals("Publish Date: 2010=2 2012=1", toString(r.facetResults.get(0))); + // Author is drill-sideways + drill-down: Lisa + // (drill-down) published twice, and Frank/Susan/Bob + // published once: + assertEquals("Author: Lisa=2 Frank=1 Susan=1 Bob=1", toString(r.facetResults.get(1))); + + // More interesting case: drill-down on two fields + ddq = new DrillDownQuery(fsp.indexingParams, new MatchAllDocsQuery()); + ddq.add(new CategoryPath("Author", "Lisa")); + ddq.add(new CategoryPath("Publish Date", "2010")); + r = DrillSideways.search(searcher, taxoReader, ddq, 10, fsp); + assertEquals(1, 
r.hits.totalHits); + assertEquals(2, r.facetResults.size()); + // Publish Date is drill-sideways + drill-down: Lisa + // (drill-down) published once in 2010 and once in 2012: + assertEquals("Publish Date: 2012=1 2010=1", toString(r.facetResults.get(0))); + // Author is drill-sideways + drill-down: + // only Lisa & Bob published (once each) in 2010: + assertEquals("Author: Lisa=1 Bob=1", toString(r.facetResults.get(1))); + + // Even more interesting case: drill down on two fields, + // but one of them is OR + System.out.println("\nTEST: Lisa OR Bob"); + ddq = new DrillDownQuery(fsp.indexingParams, new MatchAllDocsQuery()); + + // Drill down on Lisa or Bob: + ddq.add(new CategoryPath("Author", "Lisa"), + new CategoryPath("Author", "Bob")); + ddq.add(new CategoryPath("Publish Date", "2010")); + r = DrillSideways.search(searcher, taxoReader, ddq, 10, fsp); + assertEquals(2, r.hits.totalHits); + assertEquals(2, r.facetResults.size()); + // Publish Date is both drill-sideways + drill-down: + // Lisa or Bob published twice in 2010 and once in 2012: + assertEquals("Publish Date: 2010=2 2012=1", toString(r.facetResults.get(0))); + // Author is drill-sideways + drill-down: + // only Lisa & Bob published (once each) in 2010: + assertEquals("Author: Lisa=1 Bob=1", toString(r.facetResults.get(1))); + + // Test drilling down on invalid field: + ddq = new DrillDownQuery(fsp.indexingParams, new MatchAllDocsQuery()); + ddq.add(new CategoryPath("Foobar", "Baz")); + fsp = new FacetSearchParams( + new CountFacetRequest(new CategoryPath("Publish Date"), 10), + new CountFacetRequest(new CategoryPath("Foobar"), 10)); + r = DrillSideways.search(searcher, taxoReader, ddq, 10, fsp); + assertEquals(0, r.hits.totalHits); + assertEquals(2, r.facetResults.size()); + // nocommit why isn't it null? should the ds dim be + // "empty FacetResult" too? + // assertNull(r.facetResults.get(0)); + assertEquals("Publish Date:", toString(r.facetResults.get(0))); + assertNull(r.facetResults.get(1)); + + // Test main query gets null scorer: + fsp = new FacetSearchParams( + new CountFacetRequest(new CategoryPath("Publish Date"), 10), + new CountFacetRequest(new CategoryPath("Author"), 10)); + ddq = new DrillDownQuery(fsp.indexingParams, new TermQuery(new Term("foobar", "baz"))); + ddq.add(new CategoryPath("Author", "Lisa")); + r = DrillSideways.search(searcher, taxoReader, ddq, 10, fsp); + + assertEquals(0, r.hits.totalHits); + assertEquals(2, r.facetResults.size()); + // nocommit when null vs empty? + assertEquals("Publish Date:", toString(r.facetResults.get(0))); + // nocommit when null vs empty? 
+ assertEquals("Author:", toString(r.facetResults.get(1))); + + searcher.getIndexReader().close(); + taxoReader.close(); + dir.close(); + taxoDir.close(); + } + + public void testSometimesInvalidDrillDown() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + writer = new RandomIndexWriter(random(), dir); + + // Writes facet ords to a separate directory from the + // main index: + taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + // Reused across documents, to add the necessary facet + // fields: + facetFields = new FacetFields(taxoWriter); + + add("Author/Bob", "Publish Date/2010/10/15"); + add("Author/Lisa", "Publish Date/2010/10/20"); + writer.commit(); + // 2nd segment has no Author: + add("Foobar/Lisa", "Publish Date/2012/1/1"); + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + writer.close(); + + //System.out.println("searcher=" + searcher); + + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + taxoWriter.close(); + + // Count both "Publish Date" and "Author" dimensions, in + // drill-down: + FacetSearchParams fsp = new FacetSearchParams( + new CountFacetRequest(new CategoryPath("Publish Date"), 10), + new CountFacetRequest(new CategoryPath("Author"), 10)); + + DrillDownQuery ddq = new DrillDownQuery(fsp.indexingParams, new MatchAllDocsQuery()); + ddq.add(new CategoryPath("Author", "Lisa")); + DrillSidewaysResult r = DrillSideways.search(searcher, taxoReader, ddq, 10, fsp); + + assertEquals(1, r.hits.totalHits); + assertEquals(2, r.facetResults.size()); + // Publish Date is only drill-down, and Lisa published + // one in 2012 and one in 2010: + assertEquals("Publish Date: 2010=1", toString(r.facetResults.get(0))); + // Author is drill-sideways + drill-down: Lisa + // (drill-down) published once, and Bob + // published once: + assertEquals("Author: Lisa=1 Bob=1", toString(r.facetResults.get(1))); + + searcher.getIndexReader().close(); + taxoReader.close(); + dir.close(); + taxoDir.close(); + } + + private static class Doc implements Comparable { + String id; + String contentToken; + int[] dims; + boolean deleted; + + @Override + public int compareTo(Doc other) { + return id.compareTo(other.id); + } + } + + private double aChance, bChance, cChance; + + private String randomContentToken(boolean isQuery) { + double d = random().nextDouble(); + if (isQuery) { + if (d < 0.33) { + return "a"; + } else if (d < 0.66) { + return "b"; + } else { + return "c"; + } + } else { + if (d <= aChance) { + return "a"; + } else if (d < aChance + bChance) { + return "b"; + } else { + return "c"; + } + } + } + + public void testRandom() throws Exception { + + while (aChance == 0.0) { + aChance = random().nextDouble(); + } + while (bChance == 0.0) { + bChance = random().nextDouble(); + } + while (cChance == 0.0) { + cChance = random().nextDouble(); + } + /* + aChance = .01; + bChance = 0.5; + cChance = 1.0; + */ + double sum = aChance + bChance + cChance; + aChance /= sum; + bChance /= sum; + cChance /= sum; + + int numDims = _TestUtil.nextInt(random(), 2, 10); + // nocommit + //int numDocs = atLeast(10000); + int numDocs = 10000; + if (VERBOSE) { + System.out.println("numDims=" + numDims + " numDocs=" + numDocs + " aChance=" + aChance + " bChance=" + bChance + " cChance=" + cChance); + } + String[][] dimValues = new String[numDims][]; + int valueCount = 2; + for(int dim=0;dim values = new HashSet(); + while (values.size() < valueCount) { + // nocommit + //String s = 
_TestUtil.randomRealisticUnicodeString(random()); + String s = _TestUtil.randomSimpleString(random()); + if (s.length() > 0) { + values.add(s); + } + } + dimValues[dim] = values.toArray(new String[values.size()]); + valueCount *= 2; + } + + List docs = new ArrayList(); + for(int i=0;i paths = new ArrayList(); + + if (VERBOSE) { + System.out.println(" doc id=" + rawDoc.id + " token=" + rawDoc.contentToken); + } + for(int dim=0;dim hits; + int[][] counts; + } + + private SimpleFacetResult slowDrillSidewaysSearch(IndexSearcher s, List docs, String contentToken, String[][] drillDowns, String[][] dimValues) throws Exception { + int numDims = dimValues.length; + + List hits = new ArrayList(); + Counters drillDownCounts = new Counters(dimValues); + Counters[] drillSidewaysCounts = new Counters[dimValues.length]; + for(int dim=0;dim actualValues = new HashMap(); + for(FacetResultNode childNode : fr.getFacetResultNode().subResults) { + actualValues.put(childNode.label.components[1], (int) childNode.value); + if (VERBOSE) { + System.out.println(" " + childNode.label.components[1] + ": " + (int) childNode.value); + } + } + + if (VERBOSE) { + System.out.println(" expected"); + } + + int setCount = 0; + for(int i=0;i drillDownDims = query.getDims(); + + // nocommit remove this limitation: it's silly? just do + // pure drill down in this case? + if (drillDownDims.isEmpty()) { + throw new IllegalArgumentException("there must be at least one drill-down"); + } + + BooleanQuery ddq = query.getBooleanQuery(); + BooleanClause[] clauses = ddq.getClauses(); + + // nocommit remove this limitation (allow pure browse case): + if (clauses.length == drillDownDims.size()) { + throw new IllegalArgumentException("baseQuery must not be null"); + } + + assert clauses.length == 1+drillDownDims.size(); + + for(int i=0;i 0"); + } + } + + // TODO: if query is already a BQ we could copy that and + // add clauses to it, instead of doing BQ inside BQ + // (should be more efficient)? Problem is this can + // affect scoring (coord) ... too bad we can't disable + // coord on a clause by clause basis: + Query baseQuery = clauses[0].getQuery(); + + Term[][] drillDownTerms = new Term[clauses.length-1][]; + for(int i=1;i 0; + if (fr.categoryPath.components[0].equals(dim)) { + if (drillSidewaysRequest != null) { + throw new IllegalArgumentException("multiple FacetRequests for drill-sideways dimension \"" + dim + "\""); + } + drillSidewaysRequest = fr; + } + } + if (drillSidewaysRequest == null) { + throw new IllegalArgumentException("could not find FacetRequest for drill-sideways dimension \"" + dim + "\""); + } + drillSidewaysCollectors[idx++] = FacetsCollector.create(new FacetSearchParams(fsp.indexingParams, drillSidewaysRequest), + searcher.getIndexReader(), taxoReader); + } + + DrillSidewaysQuery dsq = new DrillSidewaysQuery(baseQuery, drillDownCollector, drillSidewaysCollectors, drillDownTerms); + + searcher.search(dsq, hitCollector); + + List drillDownResults = drillDownCollector.getFacetResults(); + + List mergedResults = new ArrayList(); + for(int i=0;i 0; + Integer dimIndex = drillDownDims.get(fr.categoryPath.components[0]); + if (dimIndex == null) { + // Pure drill down dim (the current query didn't + // drill down on this dim): + mergedResults.add(drillDownResults.get(i)); + } else { + // Drill sideways dim: + List sidewaysResult = drillSidewaysCollectors[dimIndex.intValue()].getFacetResults(); + + // nocommit shouldn't this have been singleton + // list w/ null? 
+ if (sidewaysResult.isEmpty()) { + mergedResults.add(null); + } else { + assert sidewaysResult.size() == 1: "size=" + sidewaysResult.size(); + mergedResults.add(sidewaysResult.get(0)); + } + } + } + + return new DrillSidewaysResult(mergedResults, null); + } + + public static DrillSidewaysResult search(IndexSearcher searcher, TaxonomyReader taxoReader, DrillDownQuery query, + Filter filter, int topN, Sort sort, boolean doDocScores, boolean doMaxScore, FacetSearchParams fsp) throws IOException { + if (filter != null) { + // nocommit todo + throw new UnsupportedOperationException(); + //query = new FilteredQuery(query, filter); + } + if (sort != null) { + final TopFieldCollector hitCollector = TopFieldCollector.create(sort, + Math.max(topN, searcher.getIndexReader().maxDoc()), + null, + true, + doDocScores, + doMaxScore, + true); + DrillSidewaysResult r = search(searcher, taxoReader, query, hitCollector, fsp); + r.hits = hitCollector.topDocs(); + return r; + } else { + return search(searcher, taxoReader, query, topN, fsp); + } + } + + public static DrillSidewaysResult search(IndexSearcher searcher, TaxonomyReader taxoReader, DrillDownQuery query, + int topN, FacetSearchParams fsp) throws IOException { + TopScoreDocCollector hitCollector = TopScoreDocCollector.create(Math.min(topN, searcher.getIndexReader().maxDoc()), null, true); + DrillSidewaysResult r = search(searcher, taxoReader, query, hitCollector, fsp); + r.hits = hitCollector.topDocs(); + return r; + } + + public static class DrillSidewaysResult { + /** Combined drill down & sideways results. */ + public final List facetResults; + public TopDocs hits; + + DrillSidewaysResult(List facetResults, TopDocs hits) { + this.facetResults = facetResults; + this.hits = hits; + } + } + + static class DrillSidewaysQuery extends Query { + final Query baseQuery; + final Collector drillDownCollector; + final Collector[] drillSidewaysCollectors; + final Term[][] drillDownTerms; + + DrillSidewaysQuery(Query baseQuery, Collector drillDownCollector, Collector[] drillSidewaysCollectors, Term[][] drillDownTerms) { + this.baseQuery = baseQuery; + this.drillDownCollector = drillDownCollector; + this.drillSidewaysCollectors = drillSidewaysCollectors; + this.drillDownTerms = drillDownTerms; + } + + @Override + public String toString(String field) { + return "DrillSidewaysQuery"; + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + Query newQuery = baseQuery; + while(true) { + Query rewrittenQuery = newQuery.rewrite(reader); + if (rewrittenQuery == newQuery) { + break; + } + newQuery = rewrittenQuery; + } + if (newQuery == baseQuery) { + return this; + } else { + return new DrillSidewaysQuery(newQuery, drillDownCollector, drillSidewaysCollectors, drillDownTerms); + } + } + + @Override + public Weight createWeight(IndexSearcher searcher) throws IOException { + final Weight baseWeight = baseQuery.createWeight(searcher); + + return new Weight() { + @Override + public Explanation explain(AtomicReaderContext context, int doc) throws IOException { + return baseWeight.explain(context, doc); + } + + @Override + public Query getQuery() { + return baseQuery; + } + + @Override + public float getValueForNormalization() throws IOException { + return baseWeight.getValueForNormalization(); + } + + @Override + public void normalize(float norm, float topLevelBoost) { + baseWeight.normalize(norm, topLevelBoost); + } + + @Override + public boolean scoresDocsOutOfOrder() { + // nocommit dangerous ... 
it's false now but if we + // change the DSS impl it can be true ... + return false; + } + + @Override + public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, + boolean topScorer, Bits acceptDocs) throws IOException { + + DrillSidewaysScorer.DocsEnumsAndFreq[] dims = new DrillSidewaysScorer.DocsEnumsAndFreq[drillDownTerms.length]; + TermsEnum termsEnum = null; + String lastField = null; + int nullCount = 0; + for(int dim=0;dim 1) { + return null; + } + + // Sort drill-downs by most restrictive first: + Arrays.sort(dims); + + // TODO: it could be better if we take acceptDocs + // into account? + Scorer baseScorer = baseWeight.scorer(context, scoreDocsInOrder, false, acceptDocs); + + if (baseScorer == null) { + return null; + } + + return new DrillSidewaysScorer(this, context, + baseScorer, + drillDownCollector, dims); + } + }; + } + + @Override + public int hashCode() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean equals(Object obj) { + throw new UnsupportedOperationException(); + } + } +} + Property changes on: lucene/facet/src/java/org/apache/lucene/facet/search/DrillSideways.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/facet/src/java/org/apache/lucene/facet/search/DrillDownQuery.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/DrillDownQuery.java (revision 1445961) +++ lucene/facet/src/java/org/apache/lucene/facet/search/DrillDownQuery.java (working copy) @@ -18,8 +18,8 @@ */ import java.io.IOException; -import java.util.HashSet; -import java.util.Set; +import java.util.LinkedHashMap; +import java.util.Map; import org.apache.lucene.facet.params.CategoryListParams; import org.apache.lucene.facet.params.FacetIndexingParams; @@ -57,21 +57,20 @@ } private final BooleanQuery query; - private final Set drillDownDims = new HashSet(); - + private final Map drillDownDims = new LinkedHashMap(); private final FacetIndexingParams fip; /* Used by clone() */ - private DrillDownQuery(FacetIndexingParams fip, BooleanQuery query, Set drillDownDims) { + DrillDownQuery(FacetIndexingParams fip, BooleanQuery query, Map drillDownDims) { this.fip = fip; this.query = query.clone(); - this.drillDownDims.addAll(drillDownDims); + this.drillDownDims.putAll(drillDownDims); } /** - * Creates a new {@link DrillDownQuery} without a base query, which means that - * you intend to perfor a pure browsing query (equivalent to using - * {@link MatchAllDocsQuery} as base. + * Creates a new {@link DrillDownQuery} without a base query, + * to perform a pure browsing query (equivalent to using + * {@link MatchAllDocsQuery} as base). */ public DrillDownQuery(FacetIndexingParams fip) { this(fip, null); @@ -97,14 +96,14 @@ */ public void add(CategoryPath... 
paths) { Query q; + if (paths[0].length == 0) { + throw new IllegalArgumentException("all CategoryPaths must have length > 0"); + } String dim = paths[0].components[0]; - if (drillDownDims.contains(dim)) { + if (drillDownDims.containsKey(dim)) { throw new IllegalArgumentException("dimension '" + dim + "' was already added"); } if (paths.length == 1) { - if (paths[0].length == 0) { - throw new IllegalArgumentException("all CategoryPaths must have length > 0"); - } q = new TermQuery(term(fip, paths[0])); } else { BooleanQuery bq = new BooleanQuery(true); // disable coord @@ -120,7 +119,7 @@ } q = bq; } - drillDownDims.add(dim); + drillDownDims.put(dim, drillDownDims.size()); final ConstantScoreQuery drillDownQuery = new ConstantScoreQuery(q); drillDownQuery.setBoost(0.0f); @@ -162,5 +161,12 @@ public String toString(String field) { return query.toString(field); } - + + BooleanQuery getBooleanQuery() { + return query; + } + + Map getDims() { + return drillDownDims; + } } Index: lucene/facet/src/java/org/apache/lucene/facet/search/DrillSidewaysScorer.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/DrillSidewaysScorer.java (revision 0) +++ lucene/facet/src/java/org/apache/lucene/facet/search/DrillSidewaysScorer.java (working copy) @@ -0,0 +1,653 @@ +package org.apache.lucene.facet.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.StoredDocument; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight; + +class DrillSidewaysScorer extends Scorer { + + private static boolean DEBUG = false; + + // nocommit nuke DrillSidewaysCollector, and do all facet + // counting in here + + private final Collector drillDownCollector; + + private final DocsEnumsAndFreq[] dims; + + // DrillDown DocsEnums: + private final Scorer baseScorer; + + private final AtomicReaderContext context; + + private static final int CHUNK = 2048; + private static final int MASK = CHUNK-1; + + private int collectDocID = -1; + private float collectScore; + + DrillSidewaysScorer(Weight w, AtomicReaderContext context, Scorer baseScorer, Collector drillDownCollector, + DocsEnumsAndFreq[] dims) { + super(w); + this.dims = dims; + this.context = context; + this.baseScorer = baseScorer; + this.drillDownCollector = drillDownCollector; + } + + @Override + public void score(Collector collector) throws IOException { + if (DEBUG) { + System.out.println("\nscore: reader=" + context.reader()); + } + collector.setScorer(this); + drillDownCollector.setScorer(this); + drillDownCollector.setNextReader(context); + for(DocsEnumsAndFreq dim : dims) { + dim.sidewaysCollector.setScorer(this); + dim.sidewaysCollector.setNextReader(context); + } + + // nocommit if we ever allow null baseScorer ... it will + // mean we DO score docs out of order ... hmm, or if we + // change up the order of the conjuntions below + assert baseScorer != null; + + // Position all scorers to their first matching doc: + int baseDocID = baseScorer.nextDoc(); + + for(DocsEnumsAndFreq dim : dims) { + for(DocsEnum docsEnum : dim.docsEnums) { + if (docsEnum != null) { + docsEnum.nextDoc(); + } + } + } + + final int numDims = dims.length; + + DocsEnum[][] docsEnums = new DocsEnum[numDims][]; + Collector[] sidewaysCollectors = new Collector[numDims]; + for(int dim=0;dim 1 && dims[1].freq < estBaseHitCount / 10)) { + //System.out.println("s2"); + doDrillDownAdvanceScoring(collector, docsEnums, sidewaysCollectors); + } else { + //System.out.println("s3"); + doUnionScoring(collector, docsEnums, sidewaysCollectors); + } + } + + /** Used when drill downs are highly constraining vs + * baseQuery. */ + private void doDrillDownAdvanceScoring(Collector collector, DocsEnum[][] docsEnums, Collector[] sidewaysCollectors) throws IOException { + final int maxDoc = context.reader().maxDoc(); + final int numDims = dims.length; + + if (DEBUG) { + System.out.println(" doDrillDownAdvanceScoring"); + } + + // nocommit maybe a class like BS? 
+ int[] filledSlots = new int[CHUNK]; + int[] docIDs = new int[CHUNK]; + float[] scores = new float[CHUNK]; + int[] missingDims = new int[CHUNK]; + int[] counts = new int[CHUNK]; + + docIDs[0] = -1; + int nextChunkStart = CHUNK; + + while (true) { + if (DEBUG) { + System.out.println("\ncycle nextChunkStart=" + nextChunkStart + " docIds[0]=" + docIDs[0]); + } + int filledCount = 0; + + // First dim: + if (DEBUG) { + System.out.println(" dim0"); + } + for(DocsEnum docsEnum : docsEnums[0]) { + if (docsEnum == null) { + continue; + } + int docID = docsEnum.docID(); + while (docID < nextChunkStart) { + int slot = docID & MASK; + + if (docIDs[slot] != docID) { + // Mark slot as valid: + if (DEBUG) { + System.out.println(" set docID=" + docID + " id=" + context.reader().document(docID).get("id")); + } + docIDs[slot] = docID; + filledSlots[filledCount++] = slot; + missingDims[slot] = 1; + counts[slot] = 1; + } + + docID = docsEnum.nextDoc(); + } + } + + // Second dim: + if (DEBUG) { + System.out.println(" dim1"); + } + for(DocsEnum docsEnum : docsEnums[1]) { + if (docsEnum == null) { + continue; + } + int docID = docsEnum.docID(); + while (docID < nextChunkStart) { + int slot = docID & MASK; + + if (docIDs[slot] != docID) { + // Mark slot as valid: + if (DEBUG) { + System.out.println(" set docID=" + docID + " missingDim=0 id=" + context.reader().document(docID).get("id")); + } + docIDs[slot] = docID; + filledSlots[filledCount++] = slot; + missingDims[slot] = 0; + counts[slot] = 1; + } else { + // nocommit this is wonky + if (missingDims[slot] >= 1 && counts[slot] == 1) { + missingDims[slot] = 2; + // nocommit silly: + counts[slot] = 2; + if (DEBUG) { + System.out.println(" set docID=" + docID + " missingDim=2 id=" + context.reader().document(docID).get("id")); + } + } else { + counts[slot] = 1; + if (DEBUG) { + System.out.println(" set docID=" + docID + " missingDim=" + missingDims[slot] + " id=" + context.reader().document(docID).get("id")); + } + } + } + + docID = docsEnum.nextDoc(); + } + } + + // After this we can "upgrade" to conjunction, because + // any doc not seen by either dim 0 or dim 1 cannot be + // a hit or a near miss: + + if (DEBUG) { + System.out.println(" sort " + filledCount + " slots"); + } + // nocommit PQ? daat? + Arrays.sort(filledSlots, 0, filledCount); + + if (DEBUG) { + System.out.println(" baseScorer"); + } + + // Fold in baseScorer: + int newFilledCount = 0; + for(int i=0;i= dim && counts[slot] == allMatchCount) { + if (DEBUG) { + System.out.println(" set docID=" + docID + " count=" + (dim+2)); + } + missingDims[slot] = dim+1; + counts[slot] = dim+2; + } else { + if (DEBUG) { + System.out.println(" set docID=" + docID + " missing count=" + (dim+1)); + } + counts[slot] = dim+1; + } + } + // nocommit sometimes use advance? + docID = docsEnum.nextDoc(); + } + } + + int pruneCount = dim; + + // nocommit wny bother pruning...? can't i in the + // end just ignore the slots w/ count < N-1? 
+ + // nocommit we could save prune on final loop and + // just do it in collect loop + // Prune: + newFilledCount = 0; + for(int i=0;i dim) { + filledSlots[newFilledCount++] = slot; + } else { + assert count == dim: "count=" + count + " dim=" + dim + " slot=" + slot + " docID=" + docIDs[slot]; + if (DEBUG) { + System.out.println(" prune docID=" + docIDs[slot]); + } + docIDs[slot] = -1; + } + } + filledCount = newFilledCount; + if (newFilledCount == 0) { + break; + } + } + + // Collect: + if (DEBUG) { + System.out.println(" now collect: " + filledCount + " hits"); + } + for(int i=0;i= maxDoc) { + break; + } + + nextChunkStart += CHUNK; + } + } + + // nocommit can/should we count the near miss count? + + /** Used when base query is highly constraining vs the + * drilldowns; in this case we just .next() on base and + * .advance() on the dims. */ + private void doBaseAdvanceScoring(Collector collector, DocsEnum[][] docsEnums, Collector[] sidewaysCollectors) throws IOException { + if (DEBUG) { + System.out.println(" doBaseAdvanceScoring"); + } + int docID = baseScorer.docID(); + + final int numDims = dims.length; + + nextDoc: while (docID != NO_MORE_DOCS) { + int failedDim = -1; + for(int dim=0;dim= dim && counts[slot] == allMatchCount) { + if (DEBUG) { + System.out.println(" set docID=" + docID + " count=" + (dim+2)); + } + missingDims[slot] = dim+1; + counts[slot] = dim+2; + } else { + if (DEBUG) { + System.out.println(" set docID=" + docID + " missing count=" + (dim+1)); + } + counts[slot] = dim+1; + } + } + // nocommit sometimes use advance? + docID = docsEnum.nextDoc(); + } + } + + int pruneCount = dim; + + // nocommit wny bother pruning...? can't i in the + // end just ignore the slots w/ count < N-1? + + // nocommit we could save prune on final loop and + // just do it in collect loop + // Prune: + int newFilledCount = 0; + for(int i=0;i dim) { + filledSlots[newFilledCount++] = slot; + } else { + assert count == dim; + if (DEBUG) { + System.out.println(" prune docID=" + docIDs[slot]); + } + docIDs[slot] = -1; + } + } + filledCount = newFilledCount; + if (newFilledCount == 0) { + break; + } + } + + // Collect: + if (DEBUG) { + System.out.println(" now collect: " + filledCount + " hits"); + } + for(int i=0;i= maxDoc) { + break; + } + + nextChunkStart += CHUNK; + } + } + + @Override + public int docID() { + return collectDocID; + } + + @Override + public float score() { + return collectScore; + } + + @Override + public int freq() { + return 1+dims.length; + } + + @Override + public int nextDoc() { + throw new UnsupportedOperationException(); + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public Collection getChildren() { + // nocommit fixme + return Collections.emptyList(); + } + + static class DocsEnumsAndFreq implements Comparable { + DocsEnum[] docsEnums; + int freq; + Collector sidewaysCollector; + String dim; + + @Override + public int compareTo(DocsEnumsAndFreq other) { + return freq - other.freq; + } + } +} Property changes on: lucene/facet/src/java/org/apache/lucene/facet/search/DrillSidewaysScorer.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java (revision 1445961) +++ 
lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java	(working copy)
@@ -118,7 +118,7 @@
   }
 
   public void set(int index) {
-    assert index >= 0 && index < numBits;
+    assert index >= 0 && index < numBits: "index=" + index + " numBits=" + numBits;
     int wordNum = index >> 6;      // div 64
     int bit = index & 0x3f;        // mod 64
     long bitmask = 1L << bit;
Index: lucene/core/src/java/org/apache/lucene/search/FilteredQuery.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/FilteredQuery.java	(revision 1445961)
+++ lucene/core/src/java/org/apache/lucene/search/FilteredQuery.java	(working copy)
@@ -516,7 +516,7 @@
      }
 
      final Bits filterAcceptDocs = docIdSet.bits();
-      // force if RA is requested
+      // force if RA is requested
      final boolean useRandomAccess = (filterAcceptDocs != null && (useRandomAccess(filterAcceptDocs, firstFilterDoc)));
      if (useRandomAccess) {
        // if we are using random access, we return the inner scorer, just with other acceptDocs
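
For reference, a minimal caller-side sketch of the API this patch adds, written as if it were one more helper inside TestDrillSideways (so the imports above apply). It mirrors the first case in testBasic; the method name is hypothetical and the searcher/taxoReader are assumed to be opened as in that test. Illustrative only, not part of the patch:

  // Illustrative sketch: drill down on Author/Lisa while counting both the
  // "Publish Date" and "Author" dimensions.  The drilled-down dimension
  // (Author) is counted "sideways", i.e. as if its own drill-down clause
  // had not been applied.
  private DrillSidewaysResult exampleSearch(IndexSearcher searcher, TaxonomyReader taxoReader) throws IOException {
    FacetSearchParams fsp = new FacetSearchParams(
        new CountFacetRequest(new CategoryPath("Publish Date"), 10),
        new CountFacetRequest(new CategoryPath("Author"), 10));

    DrillDownQuery ddq = new DrillDownQuery(fsp.indexingParams, new MatchAllDocsQuery());
    ddq.add(new CategoryPath("Author", "Lisa"));

    // r.hits holds the ordinary drill-down hits; r.facetResults holds one
    // FacetResult per request above, in the same order.
    DrillSidewaysResult r = DrillSideways.search(searcher, taxoReader, ddq, 10, fsp);
    return r;
  }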
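
All three scoring strategies in DrillSidewaysScorer apply the same collect rule, which is easier to see stripped of the DocsEnum plumbing. A simplified restatement (illustrative only; dimMatches[dim] is a hypothetical stand-in for "this doc appears in dim's drill-down postings", and Collector is org.apache.lucene.search.Collector as used in the patch):

  // A doc matching the base query and every drill-down dim is a true hit; a
  // doc failing exactly one dim is a near-miss collected only into that dim's
  // sideways collector; a doc failing two or more dims is discarded.
  static void collectDoc(int docID, boolean[] dimMatches,
                         Collector hitCollector, Collector drillDownCollector,
                         Collector[] sidewaysCollectors) throws IOException {
    int failedDim = -1;
    for (int dim = 0; dim < dimMatches.length; dim++) {
      if (!dimMatches[dim]) {
        if (failedDim != -1) {
          return;                                    // failed 2+ dims: not even a near-miss
        }
        failedDim = dim;
      }
    }
    if (failedDim == -1) {
      hitCollector.collect(docID);                   // true hit: query hit ...
      drillDownCollector.collect(docID);             // ... plus drill-down counts ...
      for (Collector c : sidewaysCollectors) {
        c.collect(docID);                            // ... plus every dim's sideways counts
      }
    } else {
      sidewaysCollectors[failedDim].collect(docID);  // near-miss: only the failed dim counts it
    }
  }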
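
The chunked scoring in DrillSidewaysScorer maps each docID to a slot inside a 2048-doc window with a bit mask. A standalone sketch of just that slot-claiming step, under assumed simplifications (a plain int[] of docIDs already restricted to the current window instead of DocsEnums; names are hypothetical):

  // Within the window [nextChunkStart - CHUNK, nextChunkStart) each docID owns
  // exactly one slot (docID & MASK), so docIDs[slot] != docID means the slot
  // has not been claimed yet in this window (it still holds -1 or a doc from an
  // earlier window).
  static final int CHUNK = 2048;
  static final int MASK = CHUNK - 1;

  static int fillChunk(int[] chunkDocIDs, int[] docIDs, int[] counts, int[] filledSlots) {
    int filledCount = 0;
    for (int docID : chunkDocIDs) {      // all docIDs fall in the current 2048-doc window
      int slot = docID & MASK;
      if (docIDs[slot] != docID) {
        docIDs[slot] = docID;            // claim the slot for this window
        counts[slot] = 1;
        filledSlots[filledCount++] = slot;
      } else {
        counts[slot]++;                  // already seen in this window (e.g. another postings list)
      }
    }
    return filledCount;                  // number of distinct docs seen in this chunk
  }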