From 80333ccd0b3eb7a7df40f10f35c138b8218016c0 Mon Sep 17 00:00:00 2001
From: Vikas Saurabh <vsaurabh@adobe.com>
Date: Tue, 31 Oct 2017 07:41:41 +0530
Subject: [PATCH] OAK-6735: Lucene Index: improved cost estimation by using
 document count per field

---
 .../oak/plugins/index/lucene/IndexNode.java        |   2 +
 .../oak/plugins/index/lucene/IndexNodeManager.java |  11 +++
 .../oak/plugins/index/lucene/IndexPlanner.java     |  55 +++++++----
 .../oak/plugins/index/lucene/IndexStatistics.java  | 104 +++++++++++++++++++++
 .../oak/plugins/index/lucene/LuceneIndex.java      |   2 +-
 .../oak/plugins/index/lucene/IndexPlannerTest.java |  15 +--
 .../MultiplexingLucenePropertyIndexTest.java       |  13 ++-
 7 files changed, 175 insertions(+), 27 deletions(-)
 create mode 100644 oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexStatistics.java

diff --git a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexNode.java b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexNode.java
index 15deaec575..6cdee32695 100644
--- a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexNode.java
+++ b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexNode.java
@@ -36,6 +36,8 @@ public interface IndexNode {
 
     IndexSearcher getSearcher();
 
+    IndexStatistics getIndexStatistics();
+
     IndexDefinition getDefinition();
 
     List<LuceneIndexReader> getPrimaryReaders();
diff --git a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexNodeManager.java b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexNodeManager.java
index 3ff4479cd6..7cb5c8632e 100644
--- a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexNodeManager.java
+++ b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexNodeManager.java
@@ -275,10 +275,16 @@ public class IndexNodeManager {
         final IndexSearcher searcher;
         final List<LuceneIndexReader> nrtReaders;
         final int searcherId = SEARCHER_ID_COUNTER.incrementAndGet();
+        final IndexStatistics indexStatistics;
 
         public SearcherHolder(IndexSearcher searcher, List<LuceneIndexReader> nrtReaders) {
             this.searcher = searcher;
             this.nrtReaders = nrtReaders;
+            this.indexStatistics = new IndexStatistics(searcher.getIndexReader());
+        }
+
+        public IndexStatistics getIndexStatistics() {
+            return indexStatistics;
         }
     }
 
@@ -307,6 +313,11 @@ public class IndexNodeManager {
             return holder.searcher;
         }
 
+        @Override
+        public IndexStatistics getIndexStatistics() {
+            return holder.getIndexStatistics();
+        }
+
         @Override
         public IndexDefinition getDefinition() {
             return definition;
diff --git a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexPlanner.java b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexPlanner.java
index 5f21d062b6..4e28180ba5 100644
--- a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexPlanner.java
+++ b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexPlanner.java
@@ -48,7 +48,6 @@ import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm;
 import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor;
 import org.apache.jackrabbit.oak.spi.query.Filter;
 import org.apache.jackrabbit.oak.spi.query.QueryConstants;
-import org.apache.lucene.index.IndexReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -255,26 +254,14 @@ class IndexPlanner {
         boolean canSort = canSortByProperty(sortOrder);
         if (!indexedProps.isEmpty() || canSort || ft != null
                 || evalPathRestrictions || evalNodeTypeRestrictions || canEvalNodeNameRestriction) {
-            //TODO Need a way to have better cost estimate to indicate that
-            //this index can evaluate more propertyRestrictions natively (if more props are indexed)
-            //For now we reduce cost per entry
-
-            //Use propDefns instead of indexedProps as it determines true count of property restrictions
-            //which are evaluated by this index
-            int costPerEntryFactor = result.propDefns.size();
+            int costPerEntryFactor = 1;
             costPerEntryFactor += sortOrder.size();
 
-            //this index can evaluate more propertyRestrictions natively (if more props are indexed)
-            //For now we reduce cost per entry
             IndexPlan.Builder plan = defaultPlan();
             if (!sortOrder.isEmpty()) {
                 plan.setSortOrder(sortOrder);
             }
 
-            if (costPerEntryFactor == 0){
-                costPerEntryFactor = 1;
-            }
-
             if (facetFields.size() > 0) {
                 plan.setAttribute(FacetHelper.ATTR_FACET_FIELDS, facetFields);
             }
@@ -291,6 +278,11 @@ class IndexPlanner {
                 result.enableNodeNameRestriction();
             }
 
+            // Set a index based guess here. Unique would set its own value below
+            if (useActualEntryCount && !definition.isEntryCountDefined()) {
+                plan.setEstimatedEntryCount(getMaxPossibleNumDocs(result.propDefns));
+            }
+
             if (sortOrder.isEmpty() && ft == null) {
                 boolean uniqueIndexFound = planForSyncIndexes(indexingRule);
                 if (uniqueIndexFound) {
@@ -717,7 +709,7 @@ class IndexPlanner {
     }
 
     private long estimatedEntryCount() {
-        int numOfDocs = getReader().numDocs();
+        int numOfDocs = getNumDocs();
         if (useActualEntryCount) {
             return definition.isEntryCountDefined() ? definition.getEntryCount() : numOfDocs;
         } else {
@@ -742,8 +734,37 @@ class IndexPlanner {
         return PathUtils.denotesRoot(parentPath) ? "" : parentPath;
     }
 
-    private IndexReader getReader() {
-        return indexNode.getSearcher().getIndexReader();
+    private int getNumDocs() {
+        return indexNode.getIndexStatistics().numDocs();
+    }
+
+    private int getMaxPossibleNumDocs(Map<String, PropertyDefinition> propDefns) {
+        IndexStatistics indexStatistics = indexNode.getIndexStatistics();
+        int minNumDocs = indexStatistics.numDocs();
+        for (Map.Entry<String, PropertyDefinition> propDef : propDefns.entrySet()) {
+            int docCntForField = indexStatistics.getDocCountFor(propDef.getKey());
+            if (docCntForField == -1) {
+                continue;
+            }
+
+            int weight = propDef.getValue().weight;
+
+            if (weight > 1) {
+                // use it to scale down the doc count - in broad strokes, we can think of weight
+                // as number of terms for the field with all terms getting equal share of
+                // the documents in this field
+                double scaledDocCnt = Math.ceil((double) docCntForField / weight);
+                if (minNumDocs < scaledDocCnt) {
+                    continue;
+                }
+                // since, we've already taken care that scaled cost is lower than minCost,
+                // we can safely cast without risking overflow
+                minNumDocs = (int)scaledDocCnt;
+            } else if (docCntForField < minNumDocs) {
+                minNumDocs = docCntForField;
+            }
+        }
+        return minNumDocs;
     }
 
     private List<OrderEntry> createSortOrder(IndexingRule rule) {
diff --git a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexStatistics.java b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexStatistics.java
new file mode 100644
index 0000000000..9b843ce15c
--- /dev/null
+++ b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexStatistics.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene;
+
+import com.google.common.collect.Maps;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+/**
+ * This class would populate some statistics from a reader. We want to be careful here such that
+ * we only collect statistics which don't incur reads from the index i.e. we would only collect
+ * stats that lucene would already have read into memory when the reader was opened.
+ */
+public class IndexStatistics {
+    static final Logger LOG = LoggerFactory.getLogger(IndexStatistics.class);
+    private final int numDocs;
+    private final Map<String, Integer> numDocsForField;
+
+    /**
+     * @param reader {@link IndexReader} for which statistics need to be collected.
+     */
+    public IndexStatistics(IndexReader reader) {
+        numDocs = reader.numDocs();
+
+        Map<String, Integer> numDocsForField = Maps.newHashMap();
+
+        Fields fields = null;
+        try {
+            fields = MultiFields.getFields(reader);
+        } catch (IOException e) {
+            LOG.warn("Couldn't open fields for reader ({}). Won't extract doc count per field", reader);
+            numDocsForField = null;
+        }
+
+        if (fields != null) {
+            for(String f : fields) {
+                if (isPropertyField(f)) {
+                    int docCntForField = numDocs;
+                    try {
+                        docCntForField = reader.getDocCount(f);
+                    } catch (IOException e) {
+                        LOG.warn("Couldn't read doc count for field {} via reader ({}). Would use numDocs for this field");
+                    }
+                    numDocsForField.put(f, docCntForField);
+                }
+            }
+        }
+
+        if (numDocsForField != null) {
+            this.numDocsForField = Collections.unmodifiableMap(numDocsForField);
+        } else {
+            this.numDocsForField = null;
+        }
+    }
+
+    /**
+     * @return number of documents in the index
+     */
+    public int numDocs() {
+        return numDocs;
+    }
+
+    /**
+     * @param field Index field for which number of indexed documents are to be return
+     * @return number of indexed documents (without subtracting potentially deleted ones)
+     *         for the given {@code field}.
+     */
+    public int getDocCountFor(String field) {
+        int docCntForField = isPropertyField(field) ? 0 : -1;
+        if (numDocsForField.containsKey(field)) {
+            docCntForField = numDocsForField.get(field);
+        }
+
+        return docCntForField;
+    }
+
+    private boolean isPropertyField(String field) {
+        return !field.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)
+                && !field.startsWith(FieldNames.FULLTEXT_RELATIVE_NODE)
+                && !field.startsWith(":")
+                && !field.endsWith("_facet");
+    }
+}
diff --git a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java
index 3fbb73f0ce..f7b64abe93 100644
--- a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java
+++ b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java
@@ -211,7 +211,7 @@ public class LuceneIndex implements AdvanceFulltextQueryIndex {
             if (node != null){
                 IndexDefinition defn = node.getDefinition();
                 return Collections.singletonList(planBuilder(filter)
-                        .setEstimatedEntryCount(defn.getFulltextEntryCount(node.getSearcher().getIndexReader().numDocs()))
+                        .setEstimatedEntryCount(defn.getFulltextEntryCount(node.getIndexStatistics().numDocs()))
                         .setCostPerExecution(defn.getCostPerExecution())
                         .setCostPerEntry(defn.getCostPerEntry())
                         .setAttribute(ATTR_INDEX_PATH, indexPath)
diff --git a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexPlannerTest.java b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexPlannerTest.java
index 4be18386f9..5f6e96e433 100644
--- a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexPlannerTest.java
+++ b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexPlannerTest.java
@@ -519,20 +519,20 @@ public class IndexPlannerTest {
     @Test
     public void indexedButZeroWeightProps() throws Exception{
         IndexDefinitionBuilder defnb = new IndexDefinitionBuilder();
-        defnb.indexRule("nt:base").property("foo").propertyIndex().weight(0);
-        defnb.indexRule("nt:base").property("bar").propertyIndex();
+        defnb.indexRule("nt:base").property("foo").propertyIndex();
+        defnb.indexRule("nt:base").property("bar").propertyIndex().weight(0);
 
         IndexDefinition defn = new IndexDefinition(root, defnb.build(), "/foo");
         IndexNode node = createIndexNode(defn);
 
         FilterImpl filter = createFilter("nt:base");
-        filter.restrictProperty("foo", Operator.EQUAL, PropertyValues.newString("a"));
+        filter.restrictProperty("bar", Operator.EQUAL, PropertyValues.newString("a"));
         IndexPlanner planner = new IndexPlanner(node, "/foo", filter, Collections.<OrderEntry>emptyList());
         //Even though foo is indexed it would not be considered for a query involving just foo
         assertNull(planner.getPlan());
 
         filter = createFilter("nt:base");
-        filter.restrictProperty("bar", Operator.EQUAL, PropertyValues.newString("a"));
+        filter.restrictProperty("foo", Operator.EQUAL, PropertyValues.newString("a"));
         planner = new IndexPlanner(node, "/foo", filter, Collections.<OrderEntry>emptyList());
         QueryIndex.IndexPlan plan1 = planner.getPlan();
         assertNotNull(plan1);
@@ -544,9 +544,10 @@ public class IndexPlannerTest {
         QueryIndex.IndexPlan plan2 = planner.getPlan();
         assertNotNull(plan2);
 
-        //For plan2 as 2 props are indexed its costPerEntry should be less than plan1 which
-        //indexes only one prop
-        assertThat(plan2.getCostPerEntry(), lessThan(plan1.getCostPerEntry()));
+
+        // Since, the index has no entries for "bar", estimated entry count for plan2 would be 0
+        assertEquals(0, plan2.getEstimatedEntryCount());
+        assertThat(plan2.getEstimatedEntryCount(), lessThan(plan1.getEstimatedEntryCount()));
 
         assertTrue(pr(plan2).hasProperty("foo"));
         assertTrue(pr(plan2).hasProperty("bar"));
diff --git a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/MultiplexingLucenePropertyIndexTest.java b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/MultiplexingLucenePropertyIndexTest.java
index e82193b01d..4dff7a2e15 100644
--- a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/MultiplexingLucenePropertyIndexTest.java
+++ b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/MultiplexingLucenePropertyIndexTest.java
@@ -71,6 +71,9 @@ import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.jackrabbit.oak.spi.state.NodeStateUtils;
 import org.apache.jackrabbit.oak.spi.state.NodeStore;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
@@ -136,8 +139,14 @@ public class MultiplexingLucenePropertyIndexTest extends AbstractQueryTest {
         LuceneIndexWriterFactory factory = new DefaultIndexWriterFactory(mip, directoryFactory, new LuceneIndexWriterConfig());
         LuceneIndexWriter writer = factory.newInstance(defn, builder, true);
 
-        writer.updateDocument("/content/en", newDoc("/content/en"));
-        writer.updateDocument("/libs/config", newDoc("/libs/config"));
+        Document doc = newDoc("/content/en");
+        doc.add(new StringField("foo", "bar", Field.Store.NO));
+        writer.updateDocument("/content/en", doc);
+
+        doc = newDoc("/libs/config");
+        doc.add(new StringField("foo", "baz", Field.Store.NO));
+        writer.updateDocument("/libs/config", doc);
+
         writer.close(0);
 
         //2. Construct the readers
-- 
2.14.1

