Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1486871) +++ lucene/CHANGES.txt (working copy) @@ -86,6 +86,10 @@ DictionaryCompoundWordTokenFilter and HyphenationCompoundWordTokenFilter don't update offsets anymore. (Adrien Grand) +* LUCENE-5015: SamplingAccumulator no longer corrects the counts of the sampled + categories. You should set TakmiSampleFixer on SamplingParams if required (but + notice that this means slower search). (Gilad Barkai via Shai Erera) + Bug Fixes * LUCENE-4997: Internal test framework's tests are sensitive to previous Index: lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java (revision 1486871) +++ lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java (working copy) @@ -3,6 +3,7 @@ import java.io.IOException; import org.apache.lucene.facet.search.FacetResult; +import org.apache.lucene.facet.search.FacetResultNode; import org.apache.lucene.facet.search.ScoredDocIDs; /* @@ -23,22 +24,50 @@ */ /** - * Fixer of sample facet accumulation results + * Fixer of sample facet accumulation results. * * @lucene.experimental */ -public interface SampleFixer { +public abstract class SampleFixer { /** * Alter the input result, fixing it to account for the sampling. This - * implementation can compute accurate or estimated counts for the sampled facets. - * For example, a faster correction could just multiply by a compensating factor. + * implementation can compute accurate or estimated counts for the sampled + * facets. For example, a faster correction could just multiply by a + * compensating factor. * * @param origDocIds * full set of matching documents. * @param fres * sample result to be fixed. - * @throws IOException If there is a low-level I/O error. + * @throws IOException + * If there is a low-level I/O error. */ - public void fixResult(ScoredDocIDs origDocIds, FacetResult fres) throws IOException; + public void fixResult(ScoredDocIDs origDocIds, FacetResult fres, double samplingRatio) throws IOException { + FacetResultNode topRes = fres.getFacetResultNode(); + fixResultNode(topRes, origDocIds, samplingRatio); + } + + /** + * Fix result node count, and, recursively, fix all its children + * + * @param facetResNode + * result node to be fixed + * @param docIds + * docids in effect + * @throws IOException + * If there is a low-level I/O error. + */ + protected void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) + throws IOException { + singleNodeFix(facetResNode, docIds, samplingRatio); + for (FacetResultNode frn : facetResNode.subResults) { + fixResultNode(frn, docIds, samplingRatio); + } + } + + /** Fix the given node's value. */ + protected abstract void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) + throws IOException; + } \ No newline at end of file Index: lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java (revision 1486871) +++ lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java (working copy) @@ -12,7 +12,6 @@ import org.apache.lucene.facet.search.FacetResultNode; import org.apache.lucene.facet.search.ScoredDocIDs; import org.apache.lucene.facet.taxonomy.TaxonomyReader; -import org.apache.lucene.index.IndexReader; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -111,16 +110,6 @@ throws IOException; /** - * Get a fixer of sample facet accumulation results. Default implementation - * returns a TakmiSampleFixer which is adequate only for - * counting. For any other accumulator, provide a different fixer. - */ - public SampleFixer getSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader, - FacetSearchParams searchParams) { - return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams); - } - - /** * Result of sample computation */ public final static class SampleResult { Index: lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java (revision 1486871) +++ lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java (working copy) @@ -79,7 +79,11 @@ public List accumulate(ScoredDocIDs docids) throws IOException { // Replacing the original searchParams with the over-sampled FacetSearchParams original = searchParams; - searchParams = sampler.overSampledSearchParams(original); + SampleFixer samplerFixer = sampler.samplingParams.getSampleFixer(); + final boolean shouldOversample = sampler.samplingParams.shouldOverSample(); + if (shouldOversample) { + searchParams = sampler.overSampledSearchParams(original); + } List sampleRes = super.accumulate(docids); @@ -87,14 +91,18 @@ for (FacetResult fres : sampleRes) { // for sure fres is not null because this is guaranteed by the delegee. PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest()); - // fix the result of current request - sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres); + if (samplerFixer != null) { + // fix the result of current request + samplerFixer.fixResult(docids, fres, samplingRatio); + + fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any arranging it needs to + + if (shouldOversample) { + // Using the sampler to trim the extra (over-sampled) results + fres = sampler.trimResult(fres); + } + } - fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any arranging it needs to - - // Using the sampler to trim the extra (over-sampled) results - fres = sampler.trimResult(fres); - // final labeling if allowed (because labeling is a costly operation) frh.labelResult(fres); fixedRes.add(fres); // add to final results Index: lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java (revision 1486871) +++ lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java (working copy) @@ -28,7 +28,7 @@ * Default factor by which more results are requested over the sample set. * @see SamplingParams#getOversampleFactor() */ - public static final double DEFAULT_OVERSAMPLE_FACTOR = 2d; + public static final double DEFAULT_OVERSAMPLE_FACTOR = 1d; /** * Default ratio between size of sample to original size of document set. @@ -59,6 +59,8 @@ private double sampleRatio = DEFAULT_SAMPLE_RATIO; private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD; private double oversampleFactor = DEFAULT_OVERSAMPLE_FACTOR; + + private SampleFixer sampleFixer = null; /** * Return the maxSampleSize. @@ -166,4 +168,29 @@ this.oversampleFactor = oversampleFactor; } -} \ No newline at end of file + /** + * @return {@link SampleFixer} to be used while fixing the sampled results, if + * null no fixing will be performed + */ + public SampleFixer getSampleFixer() { + return sampleFixer; + } + + /** + * Set a {@link SampleFixer} to be used while fixing the sampled results. + * {@code null} means no fixing will be performed + */ + public void setSampleFixer(SampleFixer sampleFixer) { + this.sampleFixer = sampleFixer; + } + + /** + * Returns whether over-sampling should be done. By default returns + * {@code true} when {@link #getSampleFixer()} is not {@code null} and + * {@link #getOversampleFactor()} > 1, {@code false} otherwise. + */ + public boolean shouldOverSample() { + return sampleFixer != null && oversampleFactor > 1d; + } + +} Index: lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java (revision 1486871) +++ lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java (working copy) @@ -52,29 +52,41 @@ public List accumulate(ScoredDocIDs docids) throws IOException { // Replacing the original searchParams with the over-sampled (and without statistics-compute) FacetSearchParams original = delegee.searchParams; - delegee.searchParams = sampler.overSampledSearchParams(original); + boolean shouldOversample = sampler.samplingParams.shouldOverSample(); + + if (shouldOversample) { + delegee.searchParams = sampler.overSampledSearchParams(original); + } SampleResult sampleSet = sampler.getSampleSet(docids); List sampleRes = delegee.accumulate(sampleSet.docids); List fixedRes = new ArrayList(); + SampleFixer sampleFixer = sampler.samplingParams.getSampleFixer(); + for (FacetResult fres : sampleRes) { // for sure fres is not null because this is guaranteed by the delegee. PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest()); - // fix the result of current request - sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres); - fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any + if (sampleFixer != null) { + // fix the result of current request + sampleFixer.fixResult(docids, fres, sampleSet.actualSampleRatio); + fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any + } - // Using the sampler to trim the extra (over-sampled) results - fres = sampler.trimResult(fres); + if (shouldOversample) { + // Using the sampler to trim the extra (over-sampled) results + fres = sampler.trimResult(fres); + } // final labeling if allowed (because labeling is a costly operation) frh.labelResult(fres); fixedRes.add(fres); // add to final results } - delegee.searchParams = original; // Back to original params + if (shouldOversample) { + delegee.searchParams = original; // Back to original params + } return fixedRes; } Index: lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java (revision 1486871) +++ lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java (working copy) @@ -2,21 +2,19 @@ import java.io.IOException; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.util.Bits; - import org.apache.lucene.facet.params.FacetSearchParams; import org.apache.lucene.facet.search.DrillDownQuery; -import org.apache.lucene.facet.search.FacetResult; import org.apache.lucene.facet.search.FacetResultNode; import org.apache.lucene.facet.search.ScoredDocIDs; import org.apache.lucene.facet.search.ScoredDocIDsIterator; import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Bits; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -36,16 +34,21 @@ */ /** - * Fix sampling results by counting the intersection between two lists: a - * TermDocs (list of documents in a certain category) and a DocIdSetIterator - * (list of documents matching the query). + * Fix sampling results by correct results, by counting the intersection between + * two lists: a TermDocs (list of documents in a certain category) and a + * DocIdSetIterator (list of documents matching the query). + *

+ * This fixer is suitable for scenarios which prioritize accuracy over + * performance. + *

+ * Note: for statistically more accurate top-k selection, set + * {@link SamplingParams#setOversampleFactor(double) oversampleFactor} to at + * least 2, so that the top-k categories would have better chance of showing up + * in the sampled top-cK results (see {@link SamplingParams#getOversampleFactor} * - * * @lucene.experimental */ -// TODO (Facet): implement also an estimated fixing by ratio (taking into -// account "translation" of counts!) -class TakmiSampleFixer implements SampleFixer { +public class TakmiSampleFixer extends SampleFixer { private TaxonomyReader taxonomyReader; private IndexReader indexReader; @@ -59,29 +62,11 @@ } @Override - public void fixResult(ScoredDocIDs origDocIds, FacetResult fres) - throws IOException { - FacetResultNode topRes = fres.getFacetResultNode(); - fixResultNode(topRes, origDocIds); + public void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) throws IOException { + recount(facetResNode, docIds); } /** - * Fix result node count, and, recursively, fix all its children - * - * @param facetResNode - * result node to be fixed - * @param docIds - * docids in effect - * @throws IOException If there is a low-level I/O error. - */ - private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds) throws IOException { - recount(facetResNode, docIds); - for (FacetResultNode frn : facetResNode.subResults) { - fixResultNode(frn, docIds); - } - } - - /** * Internal utility: recount for a facet result node * * @param fresNode @@ -179,4 +164,5 @@ } return false; // exhausted } + } \ No newline at end of file Index: lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java (revision 1486871) +++ lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java (working copy) @@ -94,7 +94,7 @@ private Object accumulateGuard; - private double complementThreshold; + private double complementThreshold = DEFAULT_COMPLEMENT_THRESHOLD; public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader, TaxonomyReader taxonomyReader) { Index: lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java (revision 1486871) +++ lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java (working copy) @@ -94,7 +94,7 @@ for (int nTrial = 0; nTrial < RETRIES; nTrial++) { try { // complement with sampling! - final Sampler sampler = createSampler(nTrial, useRandomSampler); + final Sampler sampler = createSampler(nTrial, useRandomSampler, samplingSearchParams); assertSampling(expectedResults, q, sampler, samplingSearchParams, false); assertSampling(expectedResults, q, sampler, samplingSearchParams, true); @@ -128,14 +128,20 @@ return FacetsCollector.create(sfa); } - private Sampler createSampler(int nTrial, boolean useRandomSampler) { + private Sampler createSampler(int nTrial, boolean useRandomSampler, FacetSearchParams sParams) { SamplingParams samplingParams = new SamplingParams(); + /* + * Set sampling to Exact fixing with TakmiSampleFixer as it is not easy to + * validate results with amortized results. + */ + samplingParams.setSampleFixer(new TakmiSampleFixer(indexReader, taxoReader, sParams)); + final double retryFactor = Math.pow(1.01, nTrial); + samplingParams.setOversampleFactor(5.0 * retryFactor); // Oversampling samplingParams.setSampleRatio(0.8 * retryFactor); samplingParams.setMinSampleSize((int) (100 * retryFactor)); samplingParams.setMaxSampleSize((int) (10000 * retryFactor)); - samplingParams.setOversampleFactor(5.0 * retryFactor); samplingParams.setSamplingThreshold(11000); //force sampling Sampler sampler = useRandomSampler ? Index: lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java (revision 0) +++ lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java (working copy) @@ -0,0 +1,111 @@ +package org.apache.lucene.facet.sampling; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.facet.FacetTestBase; +import org.apache.lucene.facet.params.FacetIndexingParams; +import org.apache.lucene.facet.params.FacetSearchParams; +import org.apache.lucene.facet.search.CountFacetRequest; +import org.apache.lucene.facet.search.FacetResultNode; +import org.apache.lucene.facet.search.FacetsCollector; +import org.apache.lucene.facet.search.StandardFacetsAccumulator; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.junit.After; +import org.junit.Before; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class SamplerTest extends FacetTestBase { + + private FacetIndexingParams fip; + + @Override + @Before + public void setUp() throws Exception { + super.setUp(); + fip = getFacetIndexingParams(Integer.MAX_VALUE); + initIndex(fip); + } + + @Override + protected int numDocsToIndex() { + return 100; + } + + @Override + protected List getCategories(final int doc) { + return new ArrayList() { + { + add(new CategoryPath("root", "a", Integer.toString(doc % 10))); + } + }; + } + + @Override + protected String getContent(int doc) { + return ""; + } + + @Override + @After + public void tearDown() throws Exception { + closeAll(); + super.tearDown(); + } + + public void testDefaultFixer() throws Exception { + RandomSampler randomSampler = new RandomSampler(); + SampleFixer fixer = randomSampler.samplingParams.getSampleFixer(); + assertEquals(null, fixer); + } + + public void testCustomFixer() throws Exception { + SamplingParams sp = new SamplingParams(); + sp.setSampleFixer(new TakmiSampleFixer(null, null, null)); + assertEquals(TakmiSampleFixer.class, sp.getSampleFixer().getClass()); + } + + public void testNoFixing() throws Exception { + SamplingParams sp = new SamplingParams(); + sp.setMaxSampleSize(10); + sp.setMinSampleSize(5); + sp.setSampleRatio(0.01d); + sp.setSamplingThreshold(50); + sp.setOversampleFactor(5d); + + assertNull("Fixer should be null as the test is for no-fixing", + sp.getSampleFixer()); + FacetSearchParams fsp = new FacetSearchParams(fip, new CountFacetRequest( + new CategoryPath("root", "a"), 1)); + SamplingAccumulator accumulator = new SamplingAccumulator( + new RandomSampler(sp, random()), fsp, indexReader, taxoReader); + + // Make sure no complements are in action + accumulator + .setComplementThreshold(StandardFacetsAccumulator.DISABLE_COMPLEMENT); + + FacetsCollector fc = FacetsCollector.create(accumulator); + + searcher.search(new MatchAllDocsQuery(), fc); + FacetResultNode node = fc.getFacetResults().get(0).getFacetResultNode(); + + assertTrue(node.value < numDocsToIndex()); + } +} Property changes on: lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property