diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 7cee344295..d94568dbde 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1754,6 +1754,10 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "When estimating output rows for a join involving multiple columns, the default behavior assumes" + "the columns are independent. Setting this flag to true will cause the estimator to assume" + "the columns are correlated."), + HIVE_STATS_LOGICAL_CORRELATED_MULTI_KEY_JOINS("hive.stats.logical.correlated.multi.key.joins", true, + "When estimating output rows for a join involving multiple columns, the default behavior assumes" + + "the columns are independent. Setting this flag to true will cause the estimator to assume" + + "the columns are correlated."), // in the absence of uncompressed/raw data size, total file size will be used for statistics // annotation. But the file may be compressed, encoded and serialized which may be lesser in size // than the actual uncompressed/raw data size. This factor will be multiplied to file size to estimate diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveConfPlannerContext.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveConfPlannerContext.java new file mode 100644 index 0000000000..756b671a83 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveConfPlannerContext.java @@ -0,0 +1,34 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite; + +import org.apache.calcite.rel.RelNode; + +import java.util.Set; + +public class HiveConfPlannerContext{ + + private boolean isCorrelatedColumns; + + + public HiveConfPlannerContext(boolean isCorrelatedColumns) { + this.isCorrelatedColumns = isCorrelatedColumns; + } + + public boolean getIsCorrelatedColumns() { return isCorrelatedColumns;} +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HivePlannerContext.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HivePlannerContext.java index bdf995548f..56e2f88d79 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HivePlannerContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HivePlannerContext.java @@ -31,10 +31,11 @@ private HiveRulesRegistry registry; private CalciteConnectionConfig calciteConfig; private SubqueryConf subqueryConfig; + private HiveConfPlannerContext isCorrelatedColumns; public HivePlannerContext(HiveAlgorithmsConf algoConfig, HiveRulesRegistry registry, CalciteConnectionConfig calciteConfig, Set corrScalarRexSQWithAgg, - Set scalarAggNoGbyWindowing) { + Set scalarAggNoGbyWindowing, HiveConfPlannerContext isCorrelatedColumns) { this.algoConfig = algoConfig; this.registry = registry; this.calciteConfig = calciteConfig; @@ -42,6 +43,7 @@ public HivePlannerContext(HiveAlgorithmsConf algoConfig, HiveRulesRegistry regis // this is computed in CalcitePlanner while planning and is later required by subuery remove rule // hence this is passed using HivePlannerContext this.subqueryConfig = new SubqueryConf(corrScalarRexSQWithAgg, scalarAggNoGbyWindowing); + this.isCorrelatedColumns = isCorrelatedColumns; } public T unwrap(Class clazz) { @@ -57,6 +59,9 @@ public HivePlannerContext(HiveAlgorithmsConf algoConfig, HiveRulesRegistry regis if(clazz.isInstance(subqueryConfig)) { return clazz.cast(subqueryConfig); } + if(clazz.isInstance(isCorrelatedColumns)) { + return clazz.cast(isCorrelatedColumns); + } return null; } } \ No newline at end of file diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java index 046f51b5a0..ec62b6efee 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java @@ -17,30 +17,22 @@ */ package org.apache.hadoop.hive.ql.optimizer.calcite.stats; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Set; - +import com.google.common.collect.ImmutableMap; import org.apache.calcite.rel.core.Join; import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rel.core.SemiJoin; -import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider; -import org.apache.calcite.rel.metadata.RelMdSelectivity; -import org.apache.calcite.rel.metadata.RelMdUtil; -import org.apache.calcite.rel.metadata.RelMetadataProvider; -import org.apache.calcite.rel.metadata.RelMetadataQuery; +import org.apache.calcite.rel.metadata.*; import org.apache.calcite.rex.RexNode; import org.apache.calcite.util.BuiltInMethod; import org.apache.calcite.util.Pair; import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinLeafPredicateInfo; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinPredicateInfo; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveConfPlannerContext; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; -import com.google.common.collect.ImmutableMap; +import java.util.*; public class HiveRelMdSelectivity extends RelMdSelectivity { @@ -123,7 +115,14 @@ private Double computeInnerJoinSelectivity(Join j, RelMetadataQuery mq, RexNode int noOfPE = peLst.size(); double ndvCrossProduct = 1; if (noOfPE > 0) { - ndvCrossProduct = exponentialBackoff(peLst, colStatMap); + boolean isCorrelatedColumns = j.getCluster().getPlanner().getContext(). + unwrap(HiveConfPlannerContext.class).getIsCorrelatedColumns(); + if (noOfPE > 1 && isCorrelatedColumns ){ + ndvCrossProduct = maxNdvForCorrelatedColumns(peLst, colStatMap); + } + else { + ndvCrossProduct = exponentialBackoff(peLst, colStatMap); + } if (j instanceof SemiJoin) { ndvCrossProduct = Math.min(mq.getRowCount(j.getLeft()), @@ -185,6 +184,16 @@ protected double logSmoothing(List peLst, ImmutableMap peLst, + ImmutableMap colStatMap) { + int noOfPE = peLst.size(); + List ndvs = new ArrayList(noOfPE); + for (int i = 0; i < noOfPE; i++) { + ndvs.add(getMaxNDVForJoinSelectivity(peLst.get(i), colStatMap)); + } + return Collections.max(ndvs); + } + /* * a) Order predciates based on ndv in reverse order. b) ndvCrossProduct = * ndv(pe0) * ndv(pe1) ^(1/2) * ndv(pe2) ^(1/4) * ndv(pe3) ^(1/8) ... diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index d6695ccbf2..5b3119c280 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -148,6 +148,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveDefaultRelMetadataProvider; import org.apache.hadoop.hive.ql.optimizer.calcite.HivePlannerContext; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveConfPlannerContext; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRexExecutorImpl; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveTypeSystemImpl; @@ -1334,8 +1335,10 @@ public RelNode apply(RelOptCluster cluster, RelOptSchema relOptSchema, SchemaPlu CalciteConnectionProperty.MATERIALIZATIONS_ENABLED.camelName(), Boolean.FALSE.toString()); CalciteConnectionConfig calciteConfig = new CalciteConnectionConfigImpl(calciteConfigProperties); + boolean isCorrelatedColumns = HiveConf.getBoolVar(conf, + ConfVars.AGGR_JOIN_TRANSPOSE.HIVE_STATS_LOGICAL_CORRELATED_MULTI_KEY_JOINS); HivePlannerContext confContext = new HivePlannerContext(algorithmsConf, registry, calciteConfig, - corrScalarRexSQWithAgg, scalarAggNoGbyNoWin); + corrScalarRexSQWithAgg, scalarAggNoGbyNoWin, new HiveConfPlannerContext(isCorrelatedColumns)); RelOptPlanner planner = HiveVolcanoPlanner.createPlanner(confContext); final RexBuilder rexBuilder = cluster.getRexBuilder(); final RelOptCluster optCluster = RelOptCluster.create(planner, rexBuilder); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/TestCBORuleFiredOnlyOnce.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/TestCBORuleFiredOnlyOnce.java index 884e034731..94d6693431 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/TestCBORuleFiredOnlyOnce.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/TestCBORuleFiredOnlyOnce.java @@ -62,7 +62,7 @@ public void testRuleFiredOnlyOnce() { // Create rules registry to not trigger a rule more than once HiveRulesRegistry registry = new HiveRulesRegistry(); HivePlannerContext context = new HivePlannerContext(null, registry, null, - null, null); + null, null, null); HepPlanner planner = new HepPlanner(programBuilder.build(), context); // Cluster