diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 3777fa9..08422d5 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1422,6 +1422,11 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "This controls how many partitions can be scanned for each partitioned table.\n" + "The default value \"-1\" means no limit. (DEPRECATED: Please use " + ConfVars.METASTORE_LIMIT_PARTITION_REQUEST + " in the metastore instead.)"), + HIVECONVERTJOINMAXENTRIESHASHTABLE("hive.auto.convert.join.hashtable.max.entries", 4194304L, + "If hive.auto.convert.join.noconditionaltask is off, this parameter does not take affect. \n" + + "However, if it is on, and the predicated number of entries in hashtable for a given join \n" + + "input is larger than this number, the join will not be converted to a mapjoin. \n" + + "The value \"-1\" means no limit."), HIVEHASHTABLEKEYCOUNTADJUSTMENT("hive.hashtable.key.count.adjustment", 1.0f, "Adjustment to mapjoin hashtable size derived from table and column statistics; the estimate" + " of the number of keys is divided by this value. If the value is 0, statistics are not used" + diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index 4a69bcc..751530c 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -501,6 +501,7 @@ minillaplocal.query.files=acid_globallimit.q,\ join1.q,\ join_acid_non_acid.q,\ join_filters.q,\ + join_max_hashtable.q,\ join_nulls.q,\ join_nullsafe.q,\ leftsemijoin_mr.q,\ diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java index 93e3631..cd87994 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.ql.optimizer; import java.util.ArrayList; -import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -53,6 +52,7 @@ import org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.apache.hadoop.hive.ql.plan.CommonMergeJoinDesc; import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; @@ -63,6 +63,7 @@ import org.apache.hadoop.hive.ql.plan.OpTraits; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.Statistics; +import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.util.ReflectionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -117,7 +118,7 @@ numBuckets = 1; } LOG.info("Estimated number of buckets " + numBuckets); - int mapJoinConversionPos = getMapJoinConversionPos(joinOp, context, numBuckets, false, maxSize); + int mapJoinConversionPos = getMapJoinConversionPos(joinOp, context, numBuckets, false, maxSize, true); if (mapJoinConversionPos < 0) { Object retval = checkAndConvertSMBJoin(context, joinOp, tezBucketJoinProcCtx); if (retval == null) { @@ -141,7 +142,7 @@ // check if we can convert to map join no bucket scaling. LOG.info("Convert to non-bucketed map join"); if (numBuckets != 1) { - mapJoinConversionPos = getMapJoinConversionPos(joinOp, context, 1, false, maxSize); + mapJoinConversionPos = getMapJoinConversionPos(joinOp, context, 1, false, maxSize, true); } if (mapJoinConversionPos < 0) { // we are just converting to a common merge join operator. The shuffle @@ -520,7 +521,8 @@ private boolean checkColEquality(List> grandParentColNames, } public int getMapJoinConversionPos(JoinOperator joinOp, OptimizeTezProcContext context, - int buckets, boolean skipJoinTypeChecks, long maxSize) throws SemanticException { + int buckets, boolean skipJoinTypeChecks, long maxSize, boolean checkHashTableEntries) + throws SemanticException { if (!skipJoinTypeChecks) { /* * HIVE-9038: Join tests fail in tez when we have more than 1 join on the same key and there is @@ -628,10 +630,22 @@ public int getMapJoinConversionPos(JoinOperator joinOp, OptimizeTezProcContext c // We are replacing the current big table with a new one, thus // we need to count the current one as a map table then. totalSize += bigInputStat.getDataSize(); + // Check if number of distinct keys is larger than given max + // number of entries for HashMap. If it is, we do not convert. + if (checkHashTableEntries && !checkNumberOfEntriesForHashTable(joinOp, bigTablePosition, context)) { + LOG.info("Number of different entries for HashTable is greater than the max."); + return -1; + } } else if (!selectedBigTable) { // This is not the first table and we are not using it as big table, // in fact, we're adding this table as a map table totalSize += inputSize; + // Check if number of distinct keys is larger than given max + // number of entries for HashMap. If it is, we do not convert. + if (checkHashTableEntries && !checkNumberOfEntriesForHashTable(joinOp, pos, context)) { + LOG.info("Number of different entries for HashTable is greater than the max."); + return -1; + } } if (totalSize/buckets > maxSize) { @@ -905,8 +919,8 @@ private boolean convertJoinDynamicPartitionedHashJoin(JoinOperator joinOp, Optim int numReducers = estimateNumBuckets(joinOp, false); LOG.info("Try dynamic partitioned hash join with estimated " + numReducers + " reducers"); int bigTablePos = getMapJoinConversionPos(joinOp, context, numReducers, false, - context.conf.getLongVar( - HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD)); + context.conf.getLongVar(HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD), + false); if (bigTablePos >= 0) { // Now that we have the big table index, get real numReducers value based on big table RS ReduceSinkOperator bigTableParentRS = @@ -951,7 +965,7 @@ private void fallbackToReduceSideJoin(JoinOperator joinOp, OptimizeTezProcContex } int pos = getMapJoinConversionPos(joinOp, context, estimateNumBuckets(joinOp, false), - true, Long.MAX_VALUE); + true, Long.MAX_VALUE, false); if (pos < 0) { LOG.info("Could not get a valid join position. Defaulting to position 0"); pos = 0; @@ -961,4 +975,69 @@ private void fallbackToReduceSideJoin(JoinOperator joinOp, OptimizeTezProcContex LOG.info("Fallback to common merge join operator"); convertJoinSMBJoin(joinOp, context, pos, 0, false); } + + /* Returns true if it passes the test, false otherwise. */ + private boolean checkNumberOfEntriesForHashTable(JoinOperator joinOp, int position, + OptimizeTezProcContext context) { + long max = HiveConf.getLongVar(context.parseContext.getConf(), + HiveConf.ConfVars.HIVECONVERTJOINMAXENTRIESHASHTABLE); + if (max == -1) { + // Max is disabled, we can safely return true + return true; + } + // Calculate number of different entries and evaluate + ReduceSinkOperator rsOp = (ReduceSinkOperator) joinOp.getParentOperators().get(position); + List keys = StatsUtils.getQualifedReducerKeyNames(rsOp.getConf().getOutputKeyColumnNames()); + Statistics inputStats = rsOp.getStatistics(); + List columnStats = new ArrayList<>(); + for (String key : keys) { + ColStatistics cs = inputStats.getColumnStatisticsFromColName(key); + if (cs == null) { + LOG.warn("Couldn't get statistics for: " + key); + return true; + } + columnStats.add(cs); + } + long numRows = inputStats.getNumRows(); + long estimation = estimateNDV(numRows, columnStats); + LOG.info("Estimated NDV for input {}: {}; Max NDV for MapJoin conversion: {}", + position, estimation, max); + if (estimation > max) { + // Estimation larger than max + return false; + } + // We can proceed with the conversion + return true; + } + + private static long estimateNDV(long numRows, List columnStats) { + // If there is a single column, return the number of distinct values + if (columnStats.size() == 1) { + return columnStats.get(0).getCountDistint(); + } + + // The expected number of distinct values when choosing p values + // with replacement from n integers is n . (1 - ((n - 1) / n) ^ p). + // + // If we have several uniformly distributed attributes A1 ... Am + // with N1 ... Nm distinct values, they behave as one uniformly + // distributed attribute with N1 * ... * Nm distinct values. + long n = 1L; + for (ColStatistics cs : columnStats) { + final long ndv = cs.getCountDistint(); + if (ndv > 1) { + n = StatsUtils.safeMult(n, ndv); + } + } + final double nn = (double) n; + final double a = (nn - 1d) / nn; + if (a == 1d) { + // A under-flows if nn is large. + return numRows; + } + final double v = nn * (1d - Math.pow(a, numRows)); + // Cap at fact-row-count, because numerical artifacts can cause it + // to go a few % over. + return Math.min(Math.round(v), numRows); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 61f1374..ecf23a9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -1409,7 +1409,12 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // get the join keys from parent ReduceSink operators for (int pos = 0; pos < parents.size(); pos++) { ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos); - Statistics parentStats = parent.getStatistics(); + Statistics parentStats; + try { + parentStats = parent.getStatistics().clone(); + } catch (CloneNotSupportedException e) { + throw new SemanticException(ErrorMsg.STATISTICS_CLONING_FAILED.getMsg()); + } keyExprs = StatsUtils.getQualifedReducerKeyNames(parent.getConf() .getOutputKeyColumnNames()); diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 0da7ea4..e48b609 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1278,7 +1278,11 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis ColStatistics colStats = parentStats.getColumnStatisticsFromColName(colName); if (colStats != null) { /* If statistics for the column already exist use it. */ - return colStats; + try { + return colStats.clone(); + } catch (CloneNotSupportedException e) { + return null; + } } // virtual columns diff --git ql/src/test/queries/clientpositive/join_max_hashtable.q ql/src/test/queries/clientpositive/join_max_hashtable.q new file mode 100644 index 0000000..8854922 --- /dev/null +++ ql/src/test/queries/clientpositive/join_max_hashtable.q @@ -0,0 +1,36 @@ +set hive.auto.convert.join=true; +set hive.auto.convert.join.hashtable.max.entries=500; + +-- CONVERT +EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key); + +-- CONVERT +EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key AND x.value = y.value); + +set hive.auto.convert.join.hashtable.max.entries=300; + +-- CONVERT +EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key); + +-- DO NOT CONVERT +EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key AND x.value = y.value); + +set hive.auto.convert.join.hashtable.max.entries=10; + +-- DO NOT CONVERT +EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key); + +-- DO NOT CONVERT +EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key AND x.value = y.value); diff --git ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_1.q ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_1.q index aa331f2..d129305 100644 --- ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_1.q +++ ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_1.q @@ -52,10 +52,57 @@ order by c1; set hive.auto.convert.join=true; set hive.optimize.dynamic.partition.hashjoin=true; -set hive.auto.convert.join.noconditionaltask.size=20000; +set hive.auto.convert.join.noconditionaltask.size=300000; +-- Try with mapjoin +explain +select + * +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +order by a.cint; + +select + * +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +order by a.cint; + +explain +select + count(*) +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null; + +select + count(*) +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null; + +explain +select + a.csmallint cs, count(*) c1 +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +group by a.csmallint +order by cs; + +select + a.csmallint cs, count(*) c1 +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +group by a.csmallint +order by cs; + set hive.exec.reducers.bytes.per.reducer=20000; -set hive.stats.fetch.column.stats=false; --- Try with dynamically partitioned hashjoin +set hive.auto.convert.join.hashtable.max.entries=400; +-- Set max number of entries for hashtable +-- Cannot convert to map join anymore, using dynamically partitioned hashjoin instead explain select * @@ -100,3 +147,4 @@ where a.cint between 1000000 and 3000000 and b.cbigint is not null group by a.csmallint order by cs; + diff --git ql/src/test/results/clientpositive/llap/join_max_hashtable.q.out ql/src/test/results/clientpositive/llap/join_max_hashtable.q.out new file mode 100644 index 0000000..f0df4f0 --- /dev/null +++ ql/src/test/results/clientpositive/llap/join_max_hashtable.q.out @@ -0,0 +1,481 @@ +PREHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Map 2 (BROADCAST_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: x + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1 + input vertices: + 1 Map 2 + Statistics: Num rows: 1219 Data size: 216982 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1219 Data size: 216982 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: llap + LLAP IO: no inputs + Map 2 + Map Operator Tree: + TableScan + alias: y + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: no inputs + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key AND x.value = y.value) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key AND x.value = y.value) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Map 2 (BROADCAST_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: x + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string), _col1 (type: string) + 1 _col0 (type: string), _col1 (type: string) + outputColumnNames: _col0, _col1 + input vertices: + 1 Map 2 + Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: llap + LLAP IO: no inputs + Map 2 + Map Operator Tree: + TableScan + alias: y + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: no inputs + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Map 2 (BROADCAST_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: x + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1 + input vertices: + 1 Map 2 + Statistics: Num rows: 1219 Data size: 216982 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1219 Data size: 216982 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: llap + LLAP IO: no inputs + Map 2 + Map Operator Tree: + TableScan + alias: y + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: no inputs + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key AND x.value = y.value) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key AND x.value = y.value) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: x + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: y + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string), _col1 (type: string) + 1 _col0 (type: string), _col1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: x + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: y + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1219 Data size: 216982 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1219 Data size: 216982 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key AND x.value = y.value) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT x.key, x.value +FROM src x JOIN src y ON (x.key = y.key AND x.value = y.value) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: x + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: no inputs + Map 3 + Map Operator Tree: + TableScan + alias: y + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string), _col1 (type: string) + 1 _col0 (type: string), _col1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 890 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + diff --git ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_1.q.out ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_1.q.out index 33350a7..2beca4d 100644 --- ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_1.q.out +++ ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_1.q.out @@ -434,6 +434,397 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: + Map 1 <- Map 3 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 12288 Data size: 3093170 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: cint BETWEEN 1000000 AND 3000000 (type: boolean) + Statistics: Num rows: 1365 Data size: 343800 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 1365 Data size: 343800 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col2 (type: int) + 1 _col2 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23 + input vertices: + 1 Map 3 + Statistics: Num rows: 2166 Data size: 1342920 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col2 (type: int) + sort order: + + Statistics: Num rows: 2166 Data size: 1342920 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean), _col12 (type: tinyint), _col13 (type: smallint), _col14 (type: int), _col15 (type: bigint), _col16 (type: float), _col17 (type: double), _col18 (type: string), _col19 (type: string), _col20 (type: timestamp), _col21 (type: timestamp), _col22 (type: boolean), _col23 (type: boolean) + Execution mode: llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 12288 Data size: 3093170 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (cint BETWEEN 1000000 AND 3000000 and cbigint is not null) (type: boolean) + Statistics: Num rows: 1019 Data size: 256780 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 1019 Data size: 256780 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col2 (type: int) + sort order: + + Map-reduce partition columns: _col2 (type: int) + Statistics: Num rows: 1019 Data size: 256780 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: tinyint), VALUE._col1 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), VALUE._col4 (type: double), VALUE._col5 (type: string), VALUE._col6 (type: string), VALUE._col7 (type: timestamp), VALUE._col8 (type: timestamp), VALUE._col9 (type: boolean), VALUE._col10 (type: boolean), VALUE._col11 (type: tinyint), VALUE._col12 (type: smallint), VALUE._col13 (type: int), VALUE._col14 (type: bigint), VALUE._col15 (type: float), VALUE._col16 (type: double), VALUE._col17 (type: string), VALUE._col18 (type: string), VALUE._col19 (type: timestamp), VALUE._col20 (type: timestamp), VALUE._col21 (type: boolean), VALUE._col22 (type: boolean) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23 + Statistics: Num rows: 2166 Data size: 1342920 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2166 Data size: 1342920 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + * +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +order by a.cint +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select + * +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +order by a.cint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +11 NULL 1000828 1531084669 11.0 NULL wM316f6NqGIkoP388j3F6 poWQQo3Upvt3Wh 1969-12-31 16:00:02.351 NULL false true 11 NULL 1000828 1531084669 11.0 NULL wM316f6NqGIkoP388j3F6 poWQQo3Upvt3Wh 1969-12-31 16:00:02.351 NULL false true +NULL -3799 1248059 1864027286 NULL -3799.0 Uhps6mMh3IfHB3j7yH62K 4KWs6gw7lv2WYd66P NULL 1969-12-31 15:59:54.622 false true NULL -3799 1248059 1864027286 NULL -3799.0 Uhps6mMh3IfHB3j7yH62K 4KWs6gw7lv2WYd66P NULL 1969-12-31 15:59:54.622 false true +NULL 10782 1286921 1864027286 NULL 10782.0 ODLrXI8882q8LS8 4KWs6gw7lv2WYd66P NULL 1969-12-31 15:59:52.138 true true NULL 10782 1286921 1864027286 NULL 10782.0 ODLrXI8882q8LS8 4KWs6gw7lv2WYd66P NULL 1969-12-31 15:59:52.138 true true +NULL -13036 1288927 -1645852809 NULL -13036.0 yinBY725P7V2 xH7445Rals48VOulSyR5F NULL 1969-12-31 16:00:00.763 true false NULL -13036 1288927 -1645852809 NULL -13036.0 yinBY725P7V2 xH7445Rals48VOulSyR5F NULL 1969-12-31 16:00:00.763 true false +11 NULL 1310786 -413875656 11.0 NULL W0rvA4H1xn0xMG4uk0 8yVVjG 1969-12-31 16:00:02.351 NULL false true 11 NULL 1310786 -413875656 11.0 NULL W0rvA4H1xn0xMG4uk0 8yVVjG 1969-12-31 16:00:02.351 NULL false true +-51 NULL 2089466 -240556350 -51.0 NULL cXX24dH7tblSj46j2g C31eea0wrHHqvj 1969-12-31 16:00:08.451 NULL true true -51 NULL 2089466 -240556350 -51.0 NULL cXX24dH7tblSj46j2g C31eea0wrHHqvj 1969-12-31 16:00:08.451 NULL true true +NULL -8915 2101183 1864027286 NULL -8915.0 x7By66525 4KWs6gw7lv2WYd66P NULL 1969-12-31 16:00:05.831 false true NULL -8915 2101183 1864027286 NULL -8915.0 x7By66525 4KWs6gw7lv2WYd66P NULL 1969-12-31 16:00:05.831 false true +8 NULL 2229621 -381406148 8.0 NULL q7onkS7QRPh5ghOK oKb0bi 1969-12-31 16:00:15.892 NULL true false 8 NULL 2229621 -381406148 8.0 NULL q7onkS7QRPh5ghOK oKb0bi 1969-12-31 16:00:15.892 NULL true false +8 NULL 2433892 -1611863517 8.0 NULL 674ILv3V2TxFqXP6wSbL VLprkK2XfX 1969-12-31 16:00:15.892 NULL false true 8 NULL 2433892 -1611863517 8.0 NULL 674ILv3V2TxFqXP6wSbL VLprkK2XfX 1969-12-31 16:00:15.892 NULL false true +-51 NULL 2949963 -1580871111 -51.0 NULL 0K68k3bdl7jO7 TPPAu 1969-12-31 16:00:08.451 NULL true false -51 NULL 2949963 -1580871111 -51.0 NULL 0K68k3bdl7jO7 TPPAu 1969-12-31 16:00:08.451 NULL true false +PREHOOK: query: explain +select + count(*) +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +PREHOOK: type: QUERY +POSTHOOK: query: explain +select + count(*) +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Map 3 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 12288 Data size: 36696 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: cint BETWEEN 1000000 AND 3000000 (type: boolean) + Statistics: Num rows: 1365 Data size: 4080 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: cint (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1365 Data size: 4080 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + input vertices: + 1 Map 3 + Statistics: Num rows: 2166 Data size: 17328 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 12288 Data size: 110088 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (cint BETWEEN 1000000 AND 3000000 and cbigint is not null) (type: boolean) + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: cint (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + count(*) +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select + count(*) +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +10 +PREHOOK: query: explain +select + a.csmallint cs, count(*) c1 +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +group by a.csmallint +order by cs +PREHOOK: type: QUERY +POSTHOOK: query: explain +select + a.csmallint cs, count(*) c1 +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +group by a.csmallint +order by cs +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Map 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 12288 Data size: 73396 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: cint BETWEEN 1000000 AND 3000000 (type: boolean) + Statistics: Num rows: 1365 Data size: 8160 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: csmallint (type: smallint), cint (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1365 Data size: 8160 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0 + input vertices: + 1 Map 4 + Statistics: Num rows: 2166 Data size: 8664 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + keys: _col0 (type: smallint) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: smallint) + sort order: + + Map-reduce partition columns: _col0 (type: smallint) + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint) + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: b + Statistics: Num rows: 12288 Data size: 110088 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (cint BETWEEN 1000000 AND 3000000 and cbigint is not null) (type: boolean) + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: cint (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: smallint) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: smallint) + sort order: + + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + a.csmallint cs, count(*) c1 +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +group by a.csmallint +order by cs +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select + a.csmallint cs, count(*) c1 +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +group by a.csmallint +order by cs +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +NULL 6 +-13036 1 +-8915 1 +-3799 1 +10782 1 +PREHOOK: query: explain +select + * +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +order by a.cint +PREHOOK: type: QUERY +POSTHOOK: query: explain +select + * +from alltypesorc a join alltypesorc b on a.cint = b.cint +where + a.cint between 1000000 and 3000000 and b.cbigint is not null +order by a.cint +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE), Map 4 (CUSTOM_SIMPLE_EDGE) Reducer 3 <- Reducer 2 (SIMPLE_EDGE) #### A masked pattern was here #### @@ -442,19 +833,19 @@ STAGE PLANS: Map Operator Tree: TableScan alias: a - Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 12288 Data size: 3093170 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: cint BETWEEN 1000000 AND 3000000 (type: boolean) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1365 Data size: 343800 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1365 Data size: 343800 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col2 (type: int) sort order: + Map-reduce partition columns: _col2 (type: int) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1365 Data size: 343800 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean) Execution mode: llap LLAP IO: all inputs @@ -462,19 +853,19 @@ STAGE PLANS: Map Operator Tree: TableScan alias: b - Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 12288 Data size: 3093170 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (cint BETWEEN 1000000 AND 3000000 and cbigint is not null) (type: boolean) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1019 Data size: 256780 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1019 Data size: 256780 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col2 (type: int) sort order: + Map-reduce partition columns: _col2 (type: int) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1019 Data size: 256780 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean) Execution mode: llap LLAP IO: all inputs @@ -490,12 +881,12 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23 input vertices: 1 Map 4 - Statistics: Num rows: 1501 Data size: 322826 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 2166 Data size: 1342920 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Reduce Output Operator key expressions: _col2 (type: int) sort order: + - Statistics: Num rows: 1501 Data size: 322826 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 2166 Data size: 1342920 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col0 (type: tinyint), _col1 (type: smallint), _col3 (type: bigint), _col4 (type: float), _col5 (type: double), _col6 (type: string), _col7 (type: string), _col8 (type: timestamp), _col9 (type: timestamp), _col10 (type: boolean), _col11 (type: boolean), _col12 (type: tinyint), _col13 (type: smallint), _col14 (type: int), _col15 (type: bigint), _col16 (type: float), _col17 (type: double), _col18 (type: string), _col19 (type: string), _col20 (type: timestamp), _col21 (type: timestamp), _col22 (type: boolean), _col23 (type: boolean) Reducer 3 Execution mode: llap @@ -503,10 +894,10 @@ STAGE PLANS: Select Operator expressions: VALUE._col0 (type: tinyint), VALUE._col1 (type: smallint), KEY.reducesinkkey0 (type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: float), VALUE._col4 (type: double), VALUE._col5 (type: string), VALUE._col6 (type: string), VALUE._col7 (type: timestamp), VALUE._col8 (type: timestamp), VALUE._col9 (type: boolean), VALUE._col10 (type: boolean), VALUE._col11 (type: tinyint), VALUE._col12 (type: smallint), VALUE._col13 (type: int), VALUE._col14 (type: bigint), VALUE._col15 (type: float), VALUE._col16 (type: double), VALUE._col17 (type: string), VALUE._col18 (type: string), VALUE._col19 (type: timestamp), VALUE._col20 (type: timestamp), VALUE._col21 (type: boolean), VALUE._col22 (type: boolean) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23 - Statistics: Num rows: 1501 Data size: 322826 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 2166 Data size: 1342920 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1501 Data size: 322826 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 2166 Data size: 1342920 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -577,38 +968,38 @@ STAGE PLANS: Map Operator Tree: TableScan alias: a - Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 12288 Data size: 36696 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: cint BETWEEN 1000000 AND 3000000 (type: boolean) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1365 Data size: 4080 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: cint (type: int) outputColumnNames: _col0 - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1365 Data size: 4080 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: int) sort order: + Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1365 Data size: 4080 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: llap LLAP IO: all inputs Map 4 Map Operator Tree: TableScan alias: b - Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 12288 Data size: 110088 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (cint BETWEEN 1000000 AND 3000000 and cbigint is not null) (type: boolean) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: cint (type: int) outputColumnNames: _col0 - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: int) sort order: + Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: llap LLAP IO: all inputs Reducer 2 @@ -622,16 +1013,16 @@ STAGE PLANS: 1 KEY.reducesinkkey0 (type: int) input vertices: 1 Map 4 - Statistics: Num rows: 1501 Data size: 322826 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 2166 Data size: 17328 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Group By Operator aggregations: count() mode: hash outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col0 (type: bigint) Reducer 3 Execution mode: llap @@ -640,10 +1031,10 @@ STAGE PLANS: aggregations: count(VALUE._col0) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -708,19 +1099,19 @@ STAGE PLANS: Map Operator Tree: TableScan alias: a - Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 12288 Data size: 73396 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: cint BETWEEN 1000000 AND 3000000 (type: boolean) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1365 Data size: 8160 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: csmallint (type: smallint), cint (type: int) outputColumnNames: _col0, _col1 - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1365 Data size: 8160 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col1 (type: int) sort order: + Map-reduce partition columns: _col1 (type: int) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1365 Data size: 8160 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col0 (type: smallint) Execution mode: llap LLAP IO: all inputs @@ -728,19 +1119,19 @@ STAGE PLANS: Map Operator Tree: TableScan alias: b - Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 12288 Data size: 110088 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: (cint BETWEEN 1000000 AND 3000000 and cbigint is not null) (type: boolean) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: cint (type: int) outputColumnNames: _col0 - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: int) sort order: + Map-reduce partition columns: _col0 (type: int) - Statistics: Num rows: 1365 Data size: 293479 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1019 Data size: 9144 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: llap LLAP IO: all inputs Reducer 2 @@ -755,19 +1146,19 @@ STAGE PLANS: outputColumnNames: _col0 input vertices: 1 Map 5 - Statistics: Num rows: 1501 Data size: 322826 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 2166 Data size: 8664 Basic stats: COMPLETE Column stats: COMPLETE HybridGraceHashJoin: true Group By Operator aggregations: count() keys: _col0 (type: smallint) mode: hash outputColumnNames: _col0, _col1 - Statistics: Num rows: 1501 Data size: 322826 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: smallint) sort order: + Map-reduce partition columns: _col0 (type: smallint) - Statistics: Num rows: 1501 Data size: 322826 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: bigint) Reducer 3 Execution mode: llap @@ -777,11 +1168,11 @@ STAGE PLANS: keys: KEY._col0 (type: smallint) mode: mergepartial outputColumnNames: _col0, _col1 - Statistics: Num rows: 750 Data size: 161305 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: smallint) sort order: + - Statistics: Num rows: 750 Data size: 161305 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: bigint) Reducer 4 Execution mode: llap @@ -789,10 +1180,10 @@ STAGE PLANS: Select Operator expressions: KEY.reducesinkkey0 (type: smallint), VALUE._col0 (type: bigint) outputColumnNames: _col0, _col1 - Statistics: Num rows: 750 Data size: 161305 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 750 Data size: 161305 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 615 Data size: 7380 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat