diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index a8ff158..f6c90b7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -1032,170 +1032,156 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
       int numAttr = 1;
       AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
       HiveConf conf = aspCtx.getConf();
-      boolean allStatsAvail = true;
       boolean allSatisfyPreCondition = true;
 
       for (Operator<? extends OperatorDesc> op : parents) {
         if (op.getStatistics() == null) {
-          allStatsAvail = false;
+          return null;
        }
      }
 
-      if (allStatsAvail) {
-
-        for (Operator<? extends OperatorDesc> op : parents) {
-          if (!satisfyPrecondition(op.getStatistics())) {
-            allSatisfyPreCondition = false;
-          }
+      for (Operator<? extends OperatorDesc> op : parents) {
+        if (!satisfyPrecondition(op.getStatistics())) {
+          allSatisfyPreCondition = false;
+          break;
        }
+      }
 
-        if (allSatisfyPreCondition) {
-
-          // statistics object that is combination of statistics from all
-          // relations involved in JOIN
-          Statistics stats = new Statistics();
-          List<Long> distinctVals = Lists.newArrayList();
-          int numParent = parents.size();
-          Map<Integer, Long> rowCountParents = Maps.newHashMap();
-          Map<Integer, Statistics> joinStats = Maps.newHashMap();
-          Map<Integer, List<String>> joinKeys = Maps.newHashMap();
-          List<Long> rowCounts = Lists.newArrayList();
-
-          // detect if there are multiple attributes in join key
-          ReduceSinkOperator rsOp = (ReduceSinkOperator) jop.getParentOperators().get(0);
-          List<String> keyExprs = StatsUtils.getQualifedReducerKeyNames(rsOp.getConf()
+      if (allSatisfyPreCondition) {
+
+        // statistics object that is combination of statistics from all
+        // relations involved in JOIN
+        Statistics stats = new Statistics();
+        int numParent = parents.size();
+        Map<Integer, Long> rowCountParents = Maps.newHashMap();
+        Map<Integer, Statistics> joinStats = Maps.newHashMap();
+        Map<Integer, List<String>> joinKeys = Maps.newHashMap();
+        List<Long> rowCounts = Lists.newArrayList();
+
+        // detect if there are multiple attributes in join key
+        ReduceSinkOperator rsOp = (ReduceSinkOperator) jop.getParentOperators().get(0);
+        List<String> keyExprs = StatsUtils.getQualifedReducerKeyNames(rsOp.getConf()
+            .getOutputKeyColumnNames());
+        numAttr = keyExprs.size();
+
+        // infer PK-FK relationship in single attribute join case
+        long inferredRowCount = inferPKFKRelationship(numAttr, parents, jop);
+        // get the join keys from parent ReduceSink operators
+        for (int pos = 0; pos < parents.size(); pos++) {
+          ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
+          Statistics parentStats = parent.getStatistics();
+          keyExprs = StatsUtils.getQualifedReducerKeyNames(parent.getConf()
              .getOutputKeyColumnNames());
-          numAttr = keyExprs.size();
-
-          // infer PK-FK relationship in single attribute join case
-          long inferredRowCount = inferPKFKRelationship(numAttr, parents, jop);
-          // get the join keys from parent ReduceSink operators
-          for (int pos = 0; pos < parents.size(); pos++) {
-            ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
-            Statistics parentStats = parent.getStatistics();
-            keyExprs = StatsUtils.getQualifedReducerKeyNames(parent.getConf()
-                .getOutputKeyColumnNames());
-            rowCountParents.put(pos, parentStats.getNumRows());
-            rowCounts.add(parentStats.getNumRows());
+          rowCountParents.put(pos, parentStats.getNumRows());
+          rowCounts.add(parentStats.getNumRows());
 
-            // internal name for expressions and estimate column statistics for expression.
-            joinKeys.put(pos, keyExprs);
+          // internal name for expressions and estimate column statistics for expression.
+          joinKeys.put(pos, keyExprs);
 
-            // get column statistics for all output columns
-            joinStats.put(pos, parentStats);
+          // get column statistics for all output columns
+          joinStats.put(pos, parentStats);
 
-            // since new statistics is derived from all relations involved in
-            // JOIN, we need to update the state information accordingly
-            stats.updateColumnStatsState(parentStats.getColumnStatsState());
-          }
+          // since new statistics is derived from all relations involved in
+          // JOIN, we need to update the state information accordingly
+          stats.updateColumnStatsState(parentStats.getColumnStatsState());
+        }
 
-          // compute denominator i.e, max(V(R,Y), V(S,Y)) in case of single
-          // attribute join, else max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2))
+        List<Long> distinctVals = Lists.newArrayList();
+        long denom = 1;
+        if (inferredRowCount == -1) {
+          // failed to infer PK-FK relationship for row count estimation; fall back on default
+          // logic: compute denominator max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2))
           // in case of multi-attribute join
-          long denom = 1;
-          if (numAttr > 1) {
-            List<Long> perAttrDVs = Lists.newArrayList();
-            for (int idx = 0; idx < numAttr; idx++) {
-              for (Integer i : joinKeys.keySet()) {
-                String col = joinKeys.get(i).get(idx);
-                ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col);
-                if (cs != null) {
-                  perAttrDVs.add(cs.getCountDistint());
-                }
+          List<Long> perAttrDVs = Lists.newArrayList();
+          for (int idx = 0; idx < numAttr; idx++) {
+            for (Integer i : joinKeys.keySet()) {
+              String col = joinKeys.get(i).get(idx);
+              ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col);
+              if (cs != null) {
+                perAttrDVs.add(cs.getCountDistint());
              }
-              distinctVals.add(getDenominator(perAttrDVs));
-              perAttrDVs.clear();
            }
+            distinctVals.add(getDenominator(perAttrDVs));
+            perAttrDVs.clear();
+          }
 
-            if (numAttr > numParent) {
-              // To avoid denominator getting larger and aggressively reducing
-              // number of rows, we will ease out denominator.
-              denom = getEasedOutDenominator(distinctVals);
-            } else {
-              for (Long l : distinctVals) {
-                denom = StatsUtils.safeMult(denom, l);
-              }
-            }
+          if (numAttr > numParent) {
+            // To avoid denominator getting larger and aggressively reducing
+            // number of rows, we will ease out denominator.
+            denom = getEasedOutDenominator(distinctVals);
          } else {
-            if (numAttr == 1) {
-              for (Integer i : joinKeys.keySet()) {
-                String col = joinKeys.get(i).get(0);
-                ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col);
-                if (cs != null) {
-                  distinctVals.add(cs.getCountDistint());
-                }
-              }
+            for (Long l : distinctVals) {
+              denom = StatsUtils.safeMult(denom, l);
            }
-            denom = getDenominator(distinctVals);
          }
+        }
 
-          // Update NDV of joined columns to be min(V(R,y), V(S,y))
-          updateJoinColumnsNDV(joinKeys, joinStats, numAttr);
-
-          // column statistics from different sources are put together and
-          // rename based on output schema of join operator
-          Map<String, ExprNodeDesc> colExprMap = jop.getColumnExprMap();
-          RowSchema rs = jop.getSchema();
-          List<ColStatistics> outColStats = Lists.newArrayList();
-          for (ColumnInfo ci : rs.getSignature()) {
-            String key = ci.getInternalName();
-            ExprNodeDesc end = colExprMap.get(key);
-            if (end instanceof ExprNodeColumnDesc) {
-              String colName = ((ExprNodeColumnDesc) end).getColumn();
-              int pos = jop.getConf().getReversedExprs().get(key);
-              ColStatistics cs = joinStats.get(pos).getColumnStatisticsFromColName(colName);
-              String outColName = key;
-              if (cs != null) {
-                cs.setColumnName(outColName);
-              }
-              outColStats.add(cs);
+        // Update NDV of joined columns to be min(V(R,y), V(S,y))
+        updateJoinColumnsNDV(joinKeys, joinStats, numAttr);
+
+        // column statistics from different sources are put together and
+        // renamed based on output schema of join operator
+        Map<String, ExprNodeDesc> colExprMap = jop.getColumnExprMap();
+        RowSchema rs = jop.getSchema();
+        List<ColStatistics> outColStats = Lists.newArrayList();
+        for (ColumnInfo ci : rs.getSignature()) {
+          String key = ci.getInternalName();
+          ExprNodeDesc end = colExprMap.get(key);
+          if (end instanceof ExprNodeColumnDesc) {
+            String colName = ((ExprNodeColumnDesc) end).getColumn();
+            int pos = jop.getConf().getReversedExprs().get(key);
+            ColStatistics cs = joinStats.get(pos).getColumnStatisticsFromColName(colName);
+            String outColName = key;
+            if (cs != null) {
+              cs.setColumnName(outColName);
            }
+            outColStats.add(cs);
          }
+        }
 
-          // update join statistics
-          stats.setColumnStats(outColStats);
-          long newRowCount = inferredRowCount !=-1 ? inferredRowCount : computeNewRowCount(rowCounts, denom);
-          updateStatsForJoinType(stats, newRowCount, jop, rowCountParents);
-          jop.setStatistics(stats);
+        // update join statistics
+        stats.setColumnStats(outColStats);
+        long newRowCount = inferredRowCount != -1 ? inferredRowCount : computeNewRowCount(rowCounts, denom);
+        updateStatsForJoinType(stats, newRowCount, jop, rowCountParents);
+        jop.setStatistics(stats);
 
-          if (isDebugEnabled) {
-            LOG.debug("[0] STATS-" + jop.toString() + ": " + stats.extendedToString());
-          }
-        } else {
+        if (isDebugEnabled) {
+          LOG.debug("[0] STATS-" + jop.toString() + ": " + stats.extendedToString());
+        }
+      } else {
 
-          // worst case when there are no column statistics
-          float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR);
-          int numParents = parents.size();
-          List<Long> parentRows = Lists.newArrayList();
-          List<Long> parentSizes = Lists.newArrayList();
-          int maxRowIdx = 0;
-          long maxRowCount = 0;
-          int idx = 0;
-
-          for (Operator<? extends OperatorDesc> op : parents) {
-            Statistics ps = op.getStatistics();
-            long rowCount = ps.getNumRows();
-            if (rowCount > maxRowCount) {
-              maxRowCount = rowCount;
-              maxRowIdx = idx;
-            }
-            parentRows.add(rowCount);
-            parentSizes.add(ps.getDataSize());
-            idx++;
+        // worst case when there are no column statistics
+        float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR);
+        int numParents = parents.size();
+        List<Long> parentRows = Lists.newArrayList();
+        List<Long> parentSizes = Lists.newArrayList();
+        int maxRowIdx = 0;
+        long maxRowCount = 0;
+        int idx = 0;
+
+        for (Operator<? extends OperatorDesc> op : parents) {
+          Statistics ps = op.getStatistics();
+          long rowCount = ps.getNumRows();
+          if (rowCount > maxRowCount) {
+            maxRowCount = rowCount;
+            maxRowIdx = idx;
          }
+          parentRows.add(rowCount);
+          parentSizes.add(ps.getDataSize());
+          idx++;
+        }
 
-          long maxDataSize = parentSizes.get(maxRowIdx);
-          newNumRows = StatsUtils.safeMult(StatsUtils.safeMult(maxRowCount, (numParents - 1)), joinFactor);
-          long newDataSize = StatsUtils.safeMult(StatsUtils.safeMult(maxDataSize, (numParents - 1)), joinFactor);
-          Statistics wcStats = new Statistics();
-          wcStats.setNumRows(newNumRows);
-          wcStats.setDataSize(newDataSize);
-          jop.setStatistics(wcStats);
+        long maxDataSize = parentSizes.get(maxRowIdx);
+        newNumRows = StatsUtils.safeMult(StatsUtils.safeMult(maxRowCount, (numParents - 1)), joinFactor);
+        long newDataSize = StatsUtils.safeMult(StatsUtils.safeMult(maxDataSize, (numParents - 1)), joinFactor);
+        Statistics wcStats = new Statistics();
+        wcStats.setNumRows(newNumRows);
+        wcStats.setDataSize(newDataSize);
+        jop.setStatistics(wcStats);
 
-          if (isDebugEnabled) {
-            LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString());
-          }
+        if (isDebugEnabled) {
+          LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString());
        }
      }
      return null;
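Note on the hunk above: the fallback it preserves is the classic NDV-based join estimate, where the joined row count is the product of the parent row counts divided by a denominator built from the join-key NDVs, i.e. max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2)) for a multi-attribute key. Below is a minimal, self-contained sketch of that arithmetic; the class and method names are invented for illustration and are not Hive API (the real implementation is computeNewRowCount/getDenominator/getEasedOutDenominator above).

```java
import java.util.Arrays;
import java.util.List;

// Illustrative sketch of the NDV-based join row-count estimate (invented names, not Hive API).
public class JoinCardinalitySketch {

  // rowCounts: row counts of the join inputs.
  // perAttrMaxNdv: for each join-key attribute, the max NDV across the inputs
  // (what getDenominator() picks per attribute in the patch above).
  static long estimateJoinRows(List<Long> rowCounts, List<Long> perAttrMaxNdv) {
    long numerator = 1;
    for (long rows : rowCounts) {
      numerator = safeMult(numerator, rows);
    }
    long denom = 1;
    for (long ndv : perAttrMaxNdv) {
      denom = safeMult(denom, ndv); // Hive "eases out" this product when attrs > inputs
    }
    return numerator / Math.max(1, denom);
  }

  // Overflow-checked multiply, in the same spirit as StatsUtils.safeMult.
  static long safeMult(long a, long b) {
    long r = a * b;
    return (a != 0 && r / a != b) ? Long.MAX_VALUE : r;
  }

  public static void main(String[] args) {
    // Two 500-row inputs joined on one key whose max NDV is 500 -> ~500 rows.
    System.out.println(estimateJoinRows(Arrays.asList(500L, 500L), Arrays.asList(500L)));
  }
}
```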
@@ -1204,44 +1190,46 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
 
    private long inferPKFKRelationship(int numAttr, List<Operator<? extends OperatorDesc>> parents,
        CommonJoinOperator<? extends JoinDesc> jop) {
      long newNumRows = -1;
-      if (numAttr == 1) {
-        // If numAttr is 1, this means we join on one single key column.
-        Map<Integer, ColStatistics> parentsWithPK = getPrimaryKeyCandidates(parents);
-
-        // We only allow one single PK.
-        if (parentsWithPK.size() != 1) {
-          LOG.debug("STATS-" + jop.toString() + ": detects none/multiple PK parents.");
-          return newNumRows;
-        }
-        Integer pkPos = parentsWithPK.keySet().iterator().next();
-        ColStatistics csPK = parentsWithPK.values().iterator().next();
+      if (numAttr != 1) {
+        return newNumRows;
+      }
 
-        // infer foreign key candidates positions
-        Map<Integer, ColStatistics> csFKs = getForeignKeyCandidates(parents, csPK);
+      // If numAttr is 1, this means we join on one single key column.
+      Map<Integer, ColStatistics> parentsWithPK = getPrimaryKeyCandidates(parents);
 
-        // we allow multiple foreign keys (snowflake schema)
-        // csfKs.size() + 1 == parents.size() means we have a single PK and all
-        // the rest ops are FKs.
-        if (csFKs.size() + 1 == parents.size()) {
-          newNumRows = getCardinality(parents, pkPos, csPK, csFKs, jop);
+      // We only allow one single PK.
+      if (parentsWithPK.size() != 1) {
+        LOG.debug("STATS-" + jop.toString() + ": detects none/multiple PK parents.");
+        return newNumRows;
+      }
+      Integer pkPos = parentsWithPK.keySet().iterator().next();
+      ColStatistics csPK = parentsWithPK.values().iterator().next();
 
-          // some debug information
-          if (isDebugEnabled) {
-            List<String> parentIds = Lists.newArrayList();
+      // infer foreign key candidates positions
+      Map<Integer, ColStatistics> csFKs = getForeignKeyCandidates(parents, csPK);
 
-            // print primary key containing parents
-            for (Integer i : parentsWithPK.keySet()) {
-              parentIds.add(parents.get(i).toString());
-            }
-            LOG.debug("STATS-" + jop.toString() + ": PK parent id(s) - " + parentIds);
-            parentIds.clear();
+      // we allow multiple foreign keys (snowflake schema)
+      // csFKs.size() + 1 == parents.size() means we have a single PK and all
+      // the rest ops are FKs.
+      if (csFKs.size() + 1 == parents.size()) {
+        newNumRows = getCardinality(parents, pkPos, csPK, csFKs, jop);
 
-            // print foreign key containing parents
-            for (Integer i : csFKs.keySet()) {
-              parentIds.add(parents.get(i).toString());
-            }
-            LOG.debug("STATS-" + jop.toString() + ": FK parent id(s) - " + parentIds);
+        // some debug information
+        if (isDebugEnabled) {
+          List<String> parentIds = Lists.newArrayList();
+
+          // print primary key containing parents
+          for (Integer i : parentsWithPK.keySet()) {
+            parentIds.add(parents.get(i).toString());
+          }
+          LOG.debug("STATS-" + jop.toString() + ": PK parent id(s) - " + parentIds);
+          parentIds.clear();
+
+          // print foreign key containing parents
+          for (Integer i : csFKs.keySet()) {
+            parentIds.add(parents.get(i).toString());
          }
+          LOG.debug("STATS-" + jop.toString() + ": FK parent id(s) - " + parentIds);
        }
      }
      return newNumRows;
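Background on the PK-FK shortcut this method feeds: when the join is on a single key and exactly one parent's key column looks like a primary key (its NDV is close to its row count), the join output is roughly the foreign-key side's row count, scaled by how much of the PK side survived upstream filters. The sketch below is illustrative only, with invented names; Hive's actual heuristics live in getPrimaryKeyCandidates, getForeignKeyCandidates and getCardinality.

```java
// Illustrative sketch of PK-FK based cardinality inference (invented names, not Hive API).
public class PkFkJoinSketch {

  // A parent is a PK candidate if its join key is (nearly) unique:
  // the key's NDV is close to the operator's row count.
  static boolean looksLikePrimaryKey(long keyNdv, long numRows) {
    return numRows > 0 && keyNdv >= (long) (0.99 * numRows);
  }

  // With a single PK parent, the join keeps roughly the FK rows, scaled by
  // the fraction of the PK table that survived upstream filtering
  // (pkRows remaining out of pkBaseRows in the base table).
  static long estimatePkFkJoinRows(long fkRows, long pkRows, long pkBaseRows) {
    double pkSelectivity = pkBaseRows > 0 ? (double) pkRows / pkBaseRows : 1.0;
    return (long) Math.ceil(fkRows * pkSelectivity);
  }

  public static void main(String[] args) {
    // 1M fact rows joined to a dimension filtered down to 10% -> ~100K rows.
    System.out.println(estimatePkFkJoinRows(1_000_000L, 10_000L, 100_000L));
  }
}
```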
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 71ed31c..fa997d7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -36,6 +36,7 @@
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.Decimal;
 import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
 import org.apache.hadoop.hive.ql.exec.RowSchema;
 import org.apache.hadoop.hive.ql.exec.TableScanOperator;
 import org.apache.hadoop.hive.ql.exec.Utilities;
@@ -1247,7 +1248,7 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis
      // null projection
      if (encd.getValue() == null) {
        colName = encd.getName();
-        colType = "null";
+        colType = serdeConstants.VOID_TYPE_NAME;
        numNulls = numRows;
      } else {
        colName = encd.getName();
@@ -1261,14 +1262,37 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis
      ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end;
      colName = engfd.getName();
      colType = engfd.getTypeString();
-      countDistincts = numRows;
+      List<Long> ndvs = new ArrayList<>();
+      // ndv of a deterministic f(c1,c2) = min(ndv(c1) * ndv(c2), rowCnt)
+      if (FunctionRegistry.isDeterministic(engfd.getGenericUDF())) {
+        for (String col : engfd.getCols()) {
+          ColStatistics stats = parentStats.getColumnStatisticsFromColName(col);
+          if (stats != null) {
+            ndvs.add(stats.getCountDistint());
+          }
+        }
+        if (ndvs.isEmpty()) {
+          countDistincts = numRows;
+        } else {
+          countDistincts = 1;
+          for (Long ndv : ndvs) {
+            countDistincts = safeMult(countDistincts, ndv);
+            if (countDistincts > numRows) {
+              countDistincts = numRows;
+              break;
+            }
+          }
+        }
+      } else {
+        countDistincts = numRows;
+      }
      oi = engfd.getWritableObjectInspector();
    } else if (end instanceof ExprNodeColumnListDesc) {
      // column list
      ExprNodeColumnListDesc encd = (ExprNodeColumnListDesc) end;
      colName = Joiner.on(",").join(encd.getCols());
-      colType = "array";
+      colType = serdeConstants.LIST_TYPE_NAME;
      countDistincts = numRows;
      oi = encd.getWritableObjectInspector();
    } else if (end instanceof ExprNodeFieldDesc) {
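The StatsUtils hunk above replaces the blanket ndv = numRows guess for function expressions. The reasoning: a deterministic f(c1, ..., cn) cannot yield more distinct outputs than there are distinct input combinations, and never more than the row count, hence ndv(f) = min(ndv(c1) * ... * ndv(cn), numRows). A standalone sketch of just that rule, with invented names (the real code is the hunk above):

```java
import java.util.Arrays;
import java.util.List;

// Illustrative sketch of the new NDV rule for deterministic functions (invented names).
public class FunctionNdvSketch {

  // ndv(f(c1,...,cn)) = min(ndv(c1) * ... * ndv(cn), numRows)
  static long ndvOfDeterministicFunction(List<Long> argNdvs, long numRows) {
    if (argNdvs.isEmpty()) {
      return numRows; // no column stats available: keep the old fallback
    }
    long ndv = 1;
    for (long argNdv : argNdvs) {
      long product = ndv * argNdv;
      boolean overflow = ndv != 0 && product / ndv != argNdv;
      if (overflow || product > numRows) {
        return numRows; // capped: a function cannot exceed the row count
      }
      ndv = product;
    }
    return ndv;
  }

  public static void main(String[] args) {
    // concat(c1, c2) with NDVs 10 and 20 over 1000 rows -> at most 200 distinct values.
    System.out.println(ndvOfDeterministicFunction(Arrays.asList(10L, 20L), 1000L));
  }
}
```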
diff --git a/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out b/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out
index 7822ad9..878175f 100644
--- a/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out
+++ b/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out
@@ -71,22 +71,22 @@ STAGE PLANS:
                   0 
                   1 
                 outputColumnNames: _col0, _col1, _col5, _col6
-                Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
                 Select Operator
                   expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
                   outputColumnNames: _col0, _col1, _col2, _col3
-                  Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
                   Reduce Output Operator
                     key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string)
                     sort order: ++++
-                    Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
       Local Work:
         Map Reduce Local Work
       Reduce Operator Tree:
         Select Operator
           expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string)
           outputColumnNames: _col0, _col1, _col2, _col3
-          Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
           Group By Operator
             aggregations: sum(hash(_col0,_col1,_col2,_col3))
             mode: hash
@@ -204,22 +204,22 @@ STAGE PLANS:
                   0 
                   1 
                 outputColumnNames: _col0, _col1, _col5, _col6
-                Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
                 Select Operator
                   expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
                   outputColumnNames: _col0, _col1, _col2, _col3
-                  Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
                   Reduce Output Operator
                     key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string)
                     sort order: ++++
-                    Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
       Local Work:
         Map Reduce Local Work
       Reduce Operator Tree:
         Select Operator
           expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string)
           outputColumnNames: _col0, _col1, _col2, _col3
-          Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
           Group By Operator
             aggregations: sum(hash(_col0,_col1,_col2,_col3))
             mode: hash
diff --git a/ql/src/test/results/clientpositive/tez/explainuser_1.q.out b/ql/src/test/results/clientpositive/tez/explainuser_1.q.out
index a3d1f87..2b7eac6 100644
--- a/ql/src/test/results/clientpositive/tez/explainuser_1.q.out
+++ b/ql/src/test/results/clientpositive/tez/explainuser_1.q.out
@@ -2954,11 +2954,11 @@ Stage-0
             Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
             Select Operator [SEL_10]
                outputColumnNames:["_col0"]
-               Statistics:Num rows: 200 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
+               Statistics:Num rows: 400 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
                Merge Join Operator [MERGEJOIN_19]
                |  condition map:[{"":"Inner Join 0 to 1"}]
                |  keys:{}
-               |  Statistics:Num rows: 200 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
+               |  Statistics:Num rows: 400 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
                |<-Map 1 [SIMPLE_EDGE]
                |  Reduce Output Operator [RS_7]
                |     sort order:
@@ -5175,13 +5175,13 @@ Stage-0
         Reducer 2
         File Output Operator [FS_9]
            compressed:true
-           Statistics:Num rows: 125000 Data size: 10875000 Basic stats: COMPLETE Column stats: COMPLETE
+           Statistics:Num rows: 250000 Data size: 21750000 Basic stats: COMPLETE Column stats: COMPLETE
            table:{"input format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat","serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"}
            Merge Join Operator [MERGEJOIN_11]
            |  condition map:[{"":"Inner Join 0 to 1"}]
            |  keys:{}
            |  outputColumnNames:["_col0"]
-           |  Statistics:Num rows: 125000 Data size: 10875000 Basic stats: COMPLETE Column stats: COMPLETE
+           |  Statistics:Num rows: 250000 Data size: 21750000 Basic stats: COMPLETE Column stats: COMPLETE
            |<-Map 1 [SIMPLE_EDGE]
            |  Reduce Output Operator [RS_5]
            |     sort order:
@@ -5417,21 +5417,21 @@ Stage-0
         Reducer 3
         File Output Operator [FS_12]
            compressed:true
-           Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE
+           Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE
            table:{"input format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat","serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"}
            Select Operator [SEL_11]
            |  outputColumnNames:["_col0","_col1","_col2","_col3"]
-           |  Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE
+           |  Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE
            |<-Reducer 2 [SIMPLE_EDGE]
               Reduce Output Operator [RS_10]
                  key expressions:_col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string)
                  sort order:++++
-                 Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE
+                 Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE
                  Merge Join Operator [MERGEJOIN_15]
                  |  condition map:[{"":"Inner Join 0 to 1"}]
                  |  keys:{}
                  |  outputColumnNames:["_col0","_col1","_col2","_col3"]
-                 |  Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE
+                 |  Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE
                  |<-Map 1 [SIMPLE_EDGE]
                  |  Reduce Output Operator [RS_6]
                  |     sort order:
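The q.out updates above are the expected fallout of the estimator changes rather than independent edits: every affected operator's row estimate exactly doubles (18 -> 36, 200 -> 400, 125000 -> 250000, 13778 -> 27556), with Data size scaling in proportion (e.g. 12240 / 36 = 6120 / 18 = 340 bytes per row). For the keys:{} (cross-product) merge joins, the new estimate matches the plain product of the input row counts; assuming the standard 500-row src test table feeds the explainuser_1 plan, 500 * 500 = 250,000 rows, where the old code path presumably halved this to 125,000 via the denominator it applied even with no join keys.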