From 9f16e37741ebaeaec331a1ecb3400e7a274b869a Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Fri, 18 Nov 2016 15:04:30 -0800 Subject: [PATCH] HIVE-15234 : Semijoin cardinality estimation can be improved --- .../ql/optimizer/calcite/HiveRelFactories.java | 4 +-- .../optimizer/calcite/reloperators/HiveJoin.java | 29 +++------------------ .../calcite/reloperators/HiveMultiJoin.java | 2 +- .../calcite/stats/HiveRelMdDistinctRowCount.java | 3 ++- .../calcite/stats/HiveRelMdSelectivity.java | 17 +++++++----- .../ql/optimizer/calcite/stats/HiveRelMdSize.java | 24 +++++++++++++---- .../calcite/translator/HiveOpConverter.java | 30 ++++++++++++---------- .../hadoop/hive/ql/parse/CalcitePlanner.java | 7 +++-- 8 files changed, 59 insertions(+), 57 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java index a123f63..823b099 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java @@ -146,7 +146,7 @@ public RelNode createFilter(RelNode child, RexNode condition) { @Override public RelNode createJoin(RelNode left, RelNode right, RexNode condition, JoinRelType joinType, Set variablesStopped, boolean semiJoinDone) { - return HiveJoin.getJoin(left.getCluster(), left, right, condition, joinType, false); + return HiveJoin.getJoin(left.getCluster(), left, right, condition, joinType); } @Override @@ -154,7 +154,7 @@ public RelNode createJoin(RelNode left, RelNode right, RexNode condition, Set variablesSet, JoinRelType joinType, boolean semiJoinDone) { // According to calcite, it is going to be removed before Calcite-2.0 // TODO: to handle CorrelationId - return HiveJoin.getJoin(left.getCluster(), left, right, condition, joinType, semiJoinDone); + return HiveJoin.getJoin(left.getCluster(), left, right, condition, joinType); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java index 0ad3e81..ba9483e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveJoin.java @@ -35,7 +35,6 @@ import org.apache.calcite.rel.core.Join; import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rel.metadata.RelMetadataQuery; -import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeField; import org.apache.calcite.rex.RexNode; import org.apache.calcite.util.ImmutableBitSet; @@ -58,7 +57,6 @@ NONE, LEFT_RELATION, RIGHT_RELATION } - private final boolean leftSemiJoin; private final RexNode joinFilter; private final JoinPredicateInfo joinPredInfo; private JoinAlgorithm joinAlgorithm; @@ -66,11 +64,11 @@ public static HiveJoin getJoin(RelOptCluster cluster, RelNode left, RelNode right, - RexNode condition, JoinRelType joinType, boolean leftSemiJoin) { + RexNode condition, JoinRelType joinType) { try { Set variablesStopped = Collections.emptySet(); HiveJoin join = new HiveJoin(cluster, null, left, right, condition, joinType, variablesStopped, - DefaultJoinAlgorithm.INSTANCE, leftSemiJoin); + DefaultJoinAlgorithm.INSTANCE); return join; } catch (InvalidRelException | CalciteSemanticException e) { throw new RuntimeException(e); @@ -79,7 +77,7 @@ public static HiveJoin getJoin(RelOptCluster cluster, RelNode left, RelNode righ protected HiveJoin(RelOptCluster cluster, RelTraitSet traits, RelNode left, RelNode right, RexNode condition, JoinRelType joinType, Set variablesStopped, - JoinAlgorithm joinAlgo, boolean leftSemiJoin) throws InvalidRelException, CalciteSemanticException { + JoinAlgorithm joinAlgo) throws InvalidRelException, CalciteSemanticException { super(cluster, TraitsUtil.getDefaultTraitSet(cluster), left, right, condition, joinType, variablesStopped); final List systemFieldList = ImmutableList.of(); @@ -92,7 +90,6 @@ protected HiveJoin(RelOptCluster cluster, RelTraitSet traits, RelNode left, RelN this.getCondition(), joinKeyExprs, filterNulls, null); this.joinPredInfo = HiveCalciteUtil.JoinPredicateInfo.constructJoinPredicateInfo(this); this.joinAlgorithm = joinAlgo; - this.leftSemiJoin = leftSemiJoin; } @Override @@ -105,7 +102,7 @@ public final HiveJoin copy(RelTraitSet traitSet, RexNode conditionExpr, RelNode try { Set variablesStopped = Collections.emptySet(); HiveJoin join = new HiveJoin(getCluster(), traitSet, left, right, conditionExpr, joinType, - variablesStopped, joinAlgorithm, leftSemiJoin); + variablesStopped, joinAlgorithm); // If available, copy state to registry for optimization rules HiveRulesRegistry registry = join.getCluster().getPlanner().getContext().unwrap(HiveRulesRegistry.class); if (registry != null) { @@ -217,10 +214,6 @@ public void setJoinCost(RelOptCost joinCost) { this.joinCost = joinCost; } - public boolean isLeftSemiJoin() { - return leftSemiJoin; - } - /** * Model cost of join as size of Inputs. */ @@ -237,18 +230,4 @@ public RelWriter explainTerms(RelWriter pw) { .item("cost", joinCost == null ? "not available" : joinCost); } - - /** - * @return returns rowtype representing only the left join input - */ - @Override - public RelDataType deriveRowType() { - if (leftSemiJoin) { - return deriveJoinRowType(left.getRowType(), null, JoinRelType.INNER, - getCluster().getTypeFactory(), null, - Collections. emptyList()); - } - return super.deriveRowType(); - } - } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveMultiJoin.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveMultiJoin.java index cff737c..06279eb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveMultiJoin.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveMultiJoin.java @@ -62,7 +62,7 @@ * @param inputs inputs into this multi-join * @param condition join filter applicable to this join node * @param rowType row type of the join result of this node - * @param joinInputs + * @param joinInputs * @param joinTypes the join type corresponding to each input; if * an input is null-generating in a left or right * outer join, the entry indicates the type of diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistinctRowCount.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistinctRowCount.java index e6384a2..7c05b89 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistinctRowCount.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistinctRowCount.java @@ -34,6 +34,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; import org.apache.hadoop.hive.ql.optimizer.calcite.cost.HiveCost; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.ql.plan.ColStatistics; @@ -95,7 +96,7 @@ public Double getDistinctRowCount(Join rel, RelMetadataQuery mq, ImmutableBitSet if (rel instanceof HiveJoin) { HiveJoin hjRel = (HiveJoin) rel; //TODO: Improve this - if (hjRel.isLeftSemiJoin()) { + if (rel instanceof HiveSemiJoin) { return mq.getDistinctRowCount(hjRel.getLeft(), groupKey, rel.getCluster().getRexBuilder().makeLiteral(true)); } else { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java index 651adc0..6b2a07b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java @@ -23,6 +23,7 @@ import java.util.Map; import java.util.Set; +import org.apache.calcite.rel.core.Join; import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider; import org.apache.calcite.rel.metadata.RelMdSelectivity; @@ -36,6 +37,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinLeafPredicateInfo; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil.JoinPredicateInfo; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import com.google.common.collect.ImmutableMap; @@ -61,7 +63,7 @@ public Double getSelectivity(HiveTableScan t, RelMetadataQuery mq, RexNode predi return 1.0; } - public Double getSelectivity(HiveJoin j, RelMetadataQuery mq, RexNode predicate) { + public Double getSelectivity(Join j, RelMetadataQuery mq, RexNode predicate) { if (j.getJoinType().equals(JoinRelType.INNER)) { return computeInnerJoinSelectivity(j, mq, predicate); } else if (j.getJoinType().equals(JoinRelType.LEFT) || @@ -78,8 +80,7 @@ public Double getSelectivity(HiveJoin j, RelMetadataQuery mq, RexNode predicate) return 1.0; } - private Double computeInnerJoinSelectivity(HiveJoin j, RelMetadataQuery mq, RexNode predicate) { - double ndvCrossProduct = 1; + private Double computeInnerJoinSelectivity(Join j, RelMetadataQuery mq, RexNode predicate) { Pair predInfo = getCombinedPredicateForJoin(j, predicate); if (!predInfo.getKey()) { @@ -120,15 +121,19 @@ private Double computeInnerJoinSelectivity(HiveJoin j, RelMetadataQuery mq, RexN // NDV of the join can not exceed the cardinality of cross join. List peLst = jpi.getEquiJoinPredicateElements(); int noOfPE = peLst.size(); + double ndvCrossProduct = 1; if (noOfPE > 0) { ndvCrossProduct = exponentialBackoff(peLst, colStatMap); - if (j.isLeftSemiJoin()) + if (j instanceof HiveSemiJoin) { ndvCrossProduct = Math.min(mq.getRowCount(j.getLeft()), ndvCrossProduct); - else + }else if (j instanceof HiveJoin){ ndvCrossProduct = Math.min(mq.getRowCount(j.getLeft()) * mq.getRowCount(j.getRight()), ndvCrossProduct); + } else { + throw new RuntimeException("Unexpected Join type: " + j.getClass().getName()); + } } // 4. Join Selectivity = 1/NDV @@ -208,7 +213,7 @@ protected double exponentialBackoff(List peLst, * @return if predicate is the join condition return (true, joinCond) * else return (false, minusPred) */ - private Pair getCombinedPredicateForJoin(HiveJoin j, RexNode additionalPredicate) { + private Pair getCombinedPredicateForJoin(Join j, RexNode additionalPredicate) { RexNode minusPred = RelMdUtil.minusPreds(j.getCluster().getRexBuilder(), additionalPredicate, j.getCondition()); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSize.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSize.java index 1039f56..c67aa74 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSize.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSize.java @@ -30,6 +30,7 @@ import org.apache.calcite.util.ImmutableNullableList; import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveJoin; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSemiJoin; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan; import org.apache.hadoop.hive.ql.plan.ColStatistics; @@ -77,15 +78,27 @@ private HiveRelMdSize() {} return list.build(); } + public List averageColumnSizes(HiveSemiJoin rel, RelMetadataQuery mq) { + final RelNode left = rel.getLeft(); + final List lefts = + mq.getAverageColumnSizes(left); + if (lefts == null) { + return null; + } + final int fieldCount = rel.getRowType().getFieldCount(); + Double[] sizes = new Double[fieldCount]; + if (lefts != null) { + lefts.toArray(sizes); + } + return ImmutableNullableList.copyOf(sizes); + } + public List averageColumnSizes(HiveJoin rel, RelMetadataQuery mq) { final RelNode left = rel.getLeft(); final RelNode right = rel.getRight(); final List lefts = mq.getAverageColumnSizes(left); - List rights = null; - if (!rel.isLeftSemiJoin()) { - rights = mq.getAverageColumnSizes(right); - } + List rights = mq.getAverageColumnSizes(right); if (lefts == null && rights == null) { return null; } @@ -105,6 +118,7 @@ private HiveRelMdSize() {} // TODO: remove when averageTypeValueSize method RelMdSize // supports all types + @Override public Double averageTypeValueSize(RelDataType type) { switch (type.getSqlTypeName()) { case BOOLEAN: @@ -139,7 +153,7 @@ public Double averageTypeValueSize(RelDataType type) { case BINARY: return (double) type.getPrecision(); case VARBINARY: - return Math.min((double) type.getPrecision(), 100d); + return Math.min(type.getPrecision(), 100d); case CHAR: return (double) type.getPrecision() * BYTES_PER_CHARACTER; case VARCHAR: diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java index 0ead9be..f2d35e9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/HiveOpConverter.java @@ -35,7 +35,9 @@ import org.apache.calcite.rel.RelDistribution.Type; import org.apache.calcite.rel.RelFieldCollation; import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Join; import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.type.RelDataTypeField; import org.apache.calcite.rex.RexInputRef; import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; @@ -57,6 +59,7 @@ import org.apache.hadoop.hive.ql.io.AcidUtils.Operation; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelOptUtil; import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter; @@ -174,10 +177,7 @@ OpAttr dispatch(RelNode rn) throws SemanticException { } else if (rn instanceof HiveJoin) { return visit((HiveJoin) rn); } else if (rn instanceof HiveSemiJoin) { - HiveSemiJoin sj = (HiveSemiJoin) rn; - HiveJoin hj = HiveJoin.getJoin(sj.getCluster(), sj.getLeft(), sj.getRight(), - sj.getCondition(), sj.getJoinType(), true); - return visit(hj); + return visit((HiveSemiJoin)rn); } else if (rn instanceof HiveFilter) { return visit((HiveFilter) rn); } else if (rn instanceof HiveSortLimit) { @@ -328,6 +328,11 @@ OpAttr visit(HiveJoin joinRel) throws SemanticException { return translateJoin(joinRel); } + + OpAttr visit(HiveSemiJoin joinRel) throws SemanticException { + return translateJoin(joinRel); + } + private String getHiveDerivedTableAlias() { return "$hdt$_" + (this.uniqueCounter++); } @@ -356,7 +361,7 @@ private OpAttr translateJoin(RelNode joinRel) throws SemanticException { Set newVcolsInCalcite = new HashSet(); newVcolsInCalcite.addAll(inputs[0].vcolsInCalcite); if (joinRel instanceof HiveMultiJoin || - extractJoinType((HiveJoin)joinRel) != JoinType.LEFTSEMI) { + !(joinRel instanceof HiveSemiJoin)) { int shift = inputs[0].inputs.get(0).getSchema().getSignature().size(); for (int i = 1; i < inputs.length; i++) { newVcolsInCalcite.addAll(HiveCalciteUtil.shiftVColsSet(inputs[i].vcolsInCalcite, shift)); @@ -381,8 +386,11 @@ private OpAttr translateJoin(RelNode joinRel) throws SemanticException { List joinFilters; if (joinRel instanceof HiveJoin) { joinFilters = ImmutableList.of(((HiveJoin)joinRel).getJoinFilter()); - } else { + } else if (joinRel instanceof HiveMultiJoin){ joinFilters = ((HiveMultiJoin)joinRel).getJoinFilters(); + } else { + joinFilters = ImmutableList.of(HiveRelOptUtil.splitHiveJoinCondition(new ArrayList(), joinRel.getInputs(), + ((Join)joinRel).getCondition(), new ArrayList>(), new ArrayList(), null)); } List> filterExpressions = Lists.newArrayList(); for (int i = 0; i< joinFilters.size(); i++) { @@ -889,9 +897,9 @@ private static JoinOperator genJoin(RelNode join, ExprNodeDesc[][] joinExpressio noOuterJoin = !hmj.isOuterJoin(); } else { joinCondns = new JoinCondDesc[1]; - JoinType joinType = extractJoinType((HiveJoin)join); + JoinType joinType = extractJoinType((Join)join); joinCondns[0] = new JoinCondDesc(new JoinCond(0, 1, joinType)); - semiJoin = joinType == JoinType.LEFTSEMI; + semiJoin = join instanceof HiveSemiJoin; noOuterJoin = joinType != JoinType.FULLOUTER && joinType != JoinType.LEFTOUTER && joinType != JoinType.RIGHTOUTER; } @@ -1080,11 +1088,7 @@ private static void updateFilterMap(int[][] filterMap, int inputPos, int joinPos } } - private static JoinType extractJoinType(HiveJoin join) { - // SEMIJOIN - if (join.isLeftSemiJoin()) { - return JoinType.LEFTSEMI; - } + private static JoinType extractJoinType(Join join) { // OUTER AND INNER JOINS JoinType resultJoinType; switch (join.getJoinType()) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 78011c2..926731c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -1187,7 +1187,7 @@ private RelNode applyPreJoinOrderingTransforms(RelNode basePlan, RelMetadataProv // TODO: Decorelation of subquery should be done before attempting // Partition Pruning; otherwise Expression evaluation may try to execute // corelated sub query. - + PerfLogger perfLogger = SessionState.getPerfLogger(); final int maxCNFNodeCount = conf.getIntVar(HiveConf.ConfVars.HIVE_CBO_CNF_NODES_LIMIT); @@ -1651,8 +1651,7 @@ private RelNode genJoinRelNode(RelNode leftRel, RelNode rightRel, JoinType hiveJ LOG.warn("Duplicates detected when adding columns to RR: see previous message"); } } else { - topRel = HiveJoin.getJoin(cluster, leftRel, rightRel, calciteJoinCond, calciteJoinType, - leftSemiJoin); + topRel = HiveJoin.getJoin(cluster, leftRel, rightRel, calciteJoinCond, calciteJoinType); topRR = RowResolver.getCombinedRR(leftRR, rightRR); } @@ -3394,7 +3393,7 @@ private RelNode genUDTFPlan(GenericUDTF genericUDTF, String genericUDTFName, Str for (ColumnInfo ci : rs.getSignature()) { argTypeBldr.add(TypeConverter.convert(ci.getType(), dtFactory)); } - + SqlOperator calciteOp = SqlFunctionConverter.getCalciteOperator(genericUDTFName, genericUDTF, argTypeBldr.build(), retType); -- 1.7.12.4 (Apple Git-37)