Index: ql/src/java/org/apache/hadoop/hive/ql/Driver.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/Driver.java (revision 1435017) +++ ql/src/java/org/apache/hadoop/hive/ql/Driver.java (working copy) @@ -1108,6 +1108,9 @@ // Add root Tasks to runnable for (Task tsk : plan.getRootTasks()) { + // This should never happen, if it does, it's a bug with the potential to produce + // incorrect results. + assert tsk.getParentTasks() == null || tsk.getParentTasks().isEmpty(); driverCxt.addToRunnable(tsk); } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java (revision 1435017) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRUnion1.java (working copy) @@ -177,7 +177,8 @@ currTask.addDependentTask(uTask); if (ctx.getRootTasks().contains(uTask)) { ctx.getRootTasks().remove(uTask); - if (!ctx.getRootTasks().contains(currTask)) { + if (!ctx.getRootTasks().contains(currTask) && + shouldBeRootTask(currTask)) { ctx.getRootTasks().add(currTask); } } @@ -290,7 +291,7 @@ // If it a map-reduce job, create a temporary file else { // is the current task a root task - if (shouldBeRootTask(currTask, parseCtx) + if (shouldBeRootTask(currTask) && (!ctx.getRootTasks().contains(currTask))) { ctx.getRootTasks().add(currTask); } @@ -315,7 +316,7 @@ } private boolean shouldBeRootTask( - Task currTask, ParseContext parseContext) { + Task currTask) { return currTask.getParentTasks() == null || (currTask.getParentTasks().size() == 0); } Index: ql/src/test/queries/clientpositive/union33.q =================================================================== --- ql/src/test/queries/clientpositive/union33.q (revision 0) +++ ql/src/test/queries/clientpositive/union33.q (working copy) @@ -0,0 +1,47 @@ +set hive.groupby.skewindata=true; + +-- This tests that a union all with a map only subquery on one side and a +-- subquery involving two map reduce jobs on the other runs correctly. + +CREATE TABLE test_src (key STRING, value STRING); + +EXPLAIN INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, value FROM src + WHERE key = 0 +UNION ALL + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +)a; + +INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, value FROM src + WHERE key = 0 +UNION ALL + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +)a; + +SELECT COUNT(*) FROM test_src; + +EXPLAIN INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +UNION ALL + SELECT key, value FROM src + WHERE key = 0 +)a; + +INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +UNION ALL + SELECT key, value FROM src + WHERE key = 0 +)a; + +SELECT COUNT(*) FROM test_src; + \ No newline at end of file Index: ql/src/test/results/clientpositive/union33.q.out =================================================================== --- ql/src/test/results/clientpositive/union33.q.out (revision 0) +++ ql/src/test/results/clientpositive/union33.q.out (working copy) @@ -0,0 +1,561 @@ +PREHOOK: query: -- This tests that a union all with a map only subquery on one side and a +-- subquery involving two map reduce jobs on the other runs correctly. + +CREATE TABLE test_src (key STRING, value STRING) +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This tests that a union all with a map only subquery on one side and a +-- subquery involving two map reduce jobs on the other runs correctly. + +CREATE TABLE test_src (key STRING, value STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@test_src +PREHOOK: query: EXPLAIN INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, value FROM src + WHERE key = 0 +UNION ALL + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +)a +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, value FROM src + WHERE key = 0 +UNION ALL + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +)a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 0)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR COUNT) value)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_src))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))))) + +STAGE DEPENDENCIES: + Stage-9 is a root stage + Stage-10 depends on stages: Stage-9 + Stage-2 depends on stages: Stage-10 + Stage-8 depends on stages: Stage-2 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-0 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-3 depends on stages: Stage-0 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + +STAGE PLANS: + Stage: Stage-9 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:src + TableScan + alias: src + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: partials + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-10 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToString(_col1) + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_src + null-subquery1:a-subquery1:src + TableScan + alias: src + Filter Operator + predicate: + expr: (key = 0.0) + type: boolean + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_src + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_src + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_src + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_src + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + +PREHOOK: query: INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, value FROM src + WHERE key = 0 +UNION ALL + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +)a +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_src +POSTHOOK: query: INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, value FROM src + WHERE key = 0 +UNION ALL + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +)a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_src +POSTHOOK: Lineage: test_src.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_src.value EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), (src)src.null, ] +PREHOOK: query: SELECT COUNT(*) FROM test_src +PREHOOK: type: QUERY +PREHOOK: Input: default@test_src +#### A masked pattern was here #### +POSTHOOK: query: SELECT COUNT(*) FROM test_src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_src +#### A masked pattern was here #### +POSTHOOK: Lineage: test_src.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_src.value EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), (src)src.null, ] +312 +PREHOOK: query: EXPLAIN INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +UNION ALL + SELECT key, value FROM src + WHERE key = 0 +)a +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +UNION ALL + SELECT key, value FROM src + WHERE key = 0 +)a +POSTHOOK: type: QUERY +POSTHOOK: Lineage: test_src.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_src.value EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), (src)src.null, ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONSTAR COUNT) value)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME src))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 0))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME test_src))) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-2 + Stage-9 depends on stages: Stage-3 , consists of Stage-6, Stage-5, Stage-7 + Stage-6 + Stage-0 depends on stages: Stage-6, Stage-5, Stage-8 + Stage-4 depends on stages: Stage-0 + Stage-5 + Stage-7 + Stage-8 depends on stages: Stage-7 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:src + TableScan + alias: src + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count() + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: rand() + type: double + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: partials + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: final + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToString(_col1) + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_src + null-subquery2:a-subquery2:src + TableScan + alias: src + Filter Operator + predicate: + expr: (key = 0.0) + type: boolean + Select Operator + expressions: + expr: key + type: string + expr: value + type: string + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_src + + Stage: Stage-9 + Conditional Operator + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_src + + Stage: Stage-4 + Stats-Aggr Operator + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_src + + Stage: Stage-7 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.test_src + + Stage: Stage-8 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + +PREHOOK: query: INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +UNION ALL + SELECT key, value FROM src + WHERE key = 0 +)a +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@test_src +POSTHOOK: query: INSERT OVERWRITE TABLE test_src +SELECT key, value FROM ( + SELECT key, COUNT(*) AS value FROM src + GROUP BY key +UNION ALL + SELECT key, value FROM src + WHERE key = 0 +)a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@test_src +POSTHOOK: Lineage: test_src.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_src.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_src.value EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), (src)src.null, ] +POSTHOOK: Lineage: test_src.value EXPRESSION [(src)src.null, (src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: SELECT COUNT(*) FROM test_src +PREHOOK: type: QUERY +PREHOOK: Input: default@test_src +#### A masked pattern was here #### +POSTHOOK: query: SELECT COUNT(*) FROM test_src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@test_src +#### A masked pattern was here #### +POSTHOOK: Lineage: test_src.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_src.key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), (src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: test_src.value EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), (src)src.null, ] +POSTHOOK: Lineage: test_src.value EXPRESSION [(src)src.null, (src)src.FieldSchema(name:value, type:string, comment:default), ] +312