TakmiSampleFixer which is adequate only for
- * counting. For any other accumulator, provide a different fixer.
- */
- public SampleFixer getSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader,
- FacetSearchParams searchParams) {
- return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
- }
-
- /**
* Result of sample computation
*/
public final static class SampleResult {
Index: lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java (revision 1486871)
+++ lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java (working copy)
@@ -79,7 +79,11 @@
public Listnull no fixing will be performed
+ */
+ public SampleFixer getSampleFixer() {
+ return sampleFixer;
+ }
+
+ /**
+ * Sets the {@link SampleFixer} to be used when fixing the sampled results.
+ * {@code null} means no fixing will be performed.
+ */
+ public void setSampleFixer(SampleFixer sampleFixer) {
+ this.sampleFixer = sampleFixer;
+ }
+
+ /**
+ * Returns whether over-sampling should be done. By default returns
+ * {@code true} when {@link #getSampleFixer()} is not {@code null} and
+ * {@link #getOversampleFactor()} &gt; 1, {@code false} otherwise.
+ */
+ public boolean shouldOverSample() {
+ return sampleFixer != null && oversampleFactor > 1d;
+ }
+
+}
Index: lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java (revision 1486871)
+++ lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java (working copy)
@@ -52,29 +52,41 @@
public List+ * This fixer is suitable for scenarios which prioritize accuracy over + * performance. + *
+ * Note: for statistically more accurate top-k selection, set
+ * {@link SamplingParams#setOversampleFactor(double) oversampleFactor} to at
+ * least 2, so that the top-k categories have a better chance of showing up
+ * in the sampled top-cK results (see {@link SamplingParams#getOversampleFactor}).
*
- *
* @lucene.experimental
*/
-// TODO (Facet): implement also an estimated fixing by ratio (taking into
-// account "translation" of counts!)
-class TakmiSampleFixer implements SampleFixer {
+public class TakmiSampleFixer extends SampleFixer {
private TaxonomyReader taxonomyReader;
private IndexReader indexReader;
@@ -59,29 +62,11 @@
}
@Override
- public void fixResult(ScoredDocIDs origDocIds, FacetResult fres)
- throws IOException {
- FacetResultNode topRes = fres.getFacetResultNode();
- fixResultNode(topRes, origDocIds);
+ public void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) throws IOException {
+ recount(facetResNode, docIds);
}
/**
- * Fix result node count, and, recursively, fix all its children
- *
- * @param facetResNode
- * result node to be fixed
- * @param docIds
- * docids in effect
- * @throws IOException If there is a low-level I/O error.
- */
- private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds) throws IOException {
- recount(facetResNode, docIds);
- for (FacetResultNode frn : facetResNode.subResults) {
- fixResultNode(frn, docIds);
- }
- }
-
- /**
* Internal utility: recount for a facet result node
*
* @param fresNode
@@ -179,4 +164,5 @@
}
return false; // exhausted
}
+
}
\ No newline at end of file
Index: lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java
===================================================================
--- lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java (revision 1486871)
+++ lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java (working copy)
@@ -94,7 +94,7 @@
private Object accumulateGuard;
- private double complementThreshold;
+ private double complementThreshold = DEFAULT_COMPLEMENT_THRESHOLD;
public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
TaxonomyReader taxonomyReader) {
Index: lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java
===================================================================
--- lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java (revision 1486871)
+++ lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java (working copy)
@@ -94,7 +94,7 @@
for (int nTrial = 0; nTrial < RETRIES; nTrial++) {
try {
// complement with sampling!
- final Sampler sampler = createSampler(nTrial, useRandomSampler);
+ final Sampler sampler = createSampler(nTrial, useRandomSampler, samplingSearchParams);
assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
assertSampling(expectedResults, q, sampler, samplingSearchParams, true);
@@ -128,14 +128,20 @@
return FacetsCollector.create(sfa);
}
- private Sampler createSampler(int nTrial, boolean useRandomSampler) {
+ private Sampler createSampler(int nTrial, boolean useRandomSampler, FacetSearchParams sParams) {
SamplingParams samplingParams = new SamplingParams();
+ /*
+ * Use exact fixing with TakmiSampleFixer, because results produced by
+ * amortized fixing are not easy to validate.
+ */
+ samplingParams.setSampleFixer(new TakmiSampleFixer(indexReader, taxoReader, sParams));
+
final double retryFactor = Math.pow(1.01, nTrial);
+ samplingParams.setOversampleFactor(5.0 * retryFactor); // Oversampling
samplingParams.setSampleRatio(0.8 * retryFactor);
samplingParams.setMinSampleSize((int) (100 * retryFactor));
samplingParams.setMaxSampleSize((int) (10000 * retryFactor));
- samplingParams.setOversampleFactor(5.0 * retryFactor);
samplingParams.setSamplingThreshold(11000); //force sampling
Sampler sampler = useRandomSampler ?
Index: lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java
===================================================================
--- lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java (revision 0)
+++ lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java (working copy)
@@ -0,0 +1,111 @@
+package org.apache.lucene.facet.sampling;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.facet.FacetTestBase;
+import org.apache.lucene.facet.params.FacetIndexingParams;
+import org.apache.lucene.facet.params.FacetSearchParams;
+import org.apache.lucene.facet.search.CountFacetRequest;
+import org.apache.lucene.facet.search.FacetResultNode;
+import org.apache.lucene.facet.search.FacetsCollector;
+import org.apache.lucene.facet.search.StandardFacetsAccumulator;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.junit.After;
+import org.junit.Before;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class SamplerTest extends FacetTestBase {
+
+ private FacetIndexingParams fip;
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ fip = getFacetIndexingParams(Integer.MAX_VALUE);
+ initIndex(fip);
+ }
+
+ @Override
+ protected int numDocsToIndex() {
+ return 100;
+ }
+
+ @Override
+ protected List