Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java =================================================================== --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1379405) +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy) @@ -489,6 +489,10 @@ HIVEOPTBUCKETMAPJOIN("hive.optimize.bucketmapjoin", false), // optimize bucket map join HIVEOPTSORTMERGEBUCKETMAPJOIN("hive.optimize.bucketmapjoin.sortedmerge", false), // try to use sorted merge bucket map join HIVEOPTREDUCEDEDUPLICATION("hive.optimize.reducededuplication", true), + // Whether to optimize a union followed by a select star followed by a filesink. + // This optimization creates sub-directories in the final output, so it should not be + // turned on in systems where MAPREDUCE-1501 is not present. + HIVEOPTIMIZEUNIONSELECTSTAR("hive.optimize.union.selectstar", false), // Indexes HIVEOPTINDEXFILTER_COMPACT_MINSIZE("hive.optimize.index.filter.compact.minsize", (long) 5 * 1024 * 1024 * 1024), // 5G Index: ql/src/test/results/clientpositive/union_remove_3.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_3.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_3.q.out (working copy) @@ -0,0 +1,267 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (all of which are map-only queries) +-- followed by select star and a file sink. +-- There is no need for any optimization, since the whole query can be processed in +-- a single map-only job +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (all of which are map-only queries) +-- followed by select star and a file sink. +-- There is no need for any optimization, since the whole query can be processed in +-- a single map-only job +-- It does not matter, whether the output is merged or not.
In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values))))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 3 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery1:a-subquery1-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + null-subquery1-subquery2:a-subquery1-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: 
_col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 3 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select 
* from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +1 3 +1 1 +1 2 +2 3 +2 1 +2 2 +3 3 +3 1 +3 2 +7 3 +7 1 +7 2 +8 3 +8 1 +8 2 +8 3 +8 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_17.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_17.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_17.q.out (working copy) @@ -0,0 +1,266 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need for this optimization, since the query is a map-only query. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need for this optimization, since the query is a map-only query. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values) (TOK_SELEXPR '1' ds)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values) (TOK_SELEXPR '2' ds))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1) (TOK_PARTSPEC (TOK_PARTVAL ds)))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + expr: '1' + type: string + outputColumnNames: _col0, _col1, _col2 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + expr: '2' + type: string + outputColumnNames: _col0, _col1, _col2 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: 
bigint + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=1 +POSTHOOK: Output: default@outputtbl1@ds=2 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [] +# col_name data_type comment + +key string None +values bigint None + +# Partition Information +# col_name data_type comment + +ds string None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions outputTbl1 +PREHOOK: type: SHOWPARTITIONS +POSTHOOK: query: show partitions outputTbl1 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, 
type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [] +ds=1 +ds=2 +PREHOOK: query: select * from outputTbl1 where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [] +1 1 1 +2 1 1 +3 1 1 +7 1 1 +8 1 1 +8 1 1 +PREHOOK: query: select * from outputTbl1 where ds = '2' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '2' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [] +1 2 2 +2 2 2 +3 2 2 +7 2 2 +8 2 2 +8 2 2 Index: ql/src/test/results/clientpositive/union_remove_12.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_12.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_12.q.out (working copy) @@ -0,0 +1,325 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one is a map-join query), followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one is a map-join query), followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME inputTbl1) a) (TOK_TABREF (TOK_TABNAME inputTbl1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) val) values))))) c)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-11 is a root stage + Stage-1 depends on stages: Stage-11 + Stage-2 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-2 + Stage-8 depends on stages: Stage-3 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-0 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + +STAGE PLANS: + Stage: Stage-11 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:c-subquery2:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:c-subquery2:a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:c-subquery2:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col5 + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery1:c-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move 
Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-4 + Block level merge + + Stage: Stage-6 + Block level merge + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), ] +1 11 +2 12 +3 13 +7 17 +8 18 +8 18 +8 28 +8 28 +1 1 +2 1 +3 1 +7 1 +8 1 +8 1 Index: ql/src/test/results/clientpositive/union_remove_5.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_5.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_5.q.out (working copy) @@ -0,0 +1,315 @@ +PREHOOK: query: 
-- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values))))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-6 depends on stages: Stage-1, Stage-7, Stage-8 , consists of 
Stage-3, Stage-2, Stage-4 + Stage-3 + Stage-0 depends on stages: Stage-3, Stage-2, Stage-5 + Stage-2 + Stage-4 + Stage-5 depends on stages: Stage-4 + Stage-7 is a root stage + Stage-8 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-6 + Conditional Operator + + Stage: Stage-3 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-7 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery1:a-subquery1-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-8 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery2:a-subquery1-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: 
_col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +1 2 +2 2 +3 2 +7 2 +8 2 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 1 +8 1 Index: ql/src/test/results/clientpositive/union_remove_14.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_14.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_14.q.out (working copy) @@ -0,0 +1,423 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 
subqueries is performed (one of which is a map-only query, and the +-- other one contains a join, which should be performed as a map-join query at runtime), +-- followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed. + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a join, which should be performed as a map-join query at runtime), +-- followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed. + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME inputTbl1) a) (TOK_TABREF (TOK_TABNAME inputTbl1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) val) values))))) c)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-12 is a root stage , consists of Stage-13, Stage-14, Stage-1 + Stage-13 has a backup stage: Stage-1 + Stage-10 depends on stages: Stage-13 + Stage-2 depends on stages: Stage-1, Stage-10, Stage-11 + Stage-7 depends on stages: Stage-2 , consists of Stage-4, Stage-3, Stage-5 + Stage-4 + Stage-0 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-3 + Stage-5 + Stage-6 depends on stages: Stage-5 + Stage-14 has a backup stage: Stage-1 + Stage-11 depends on stages: Stage-14 + Stage-1 + +STAGE PLANS: + Stage: Stage-12 + Conditional Operator + + Stage: Stage-13 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:c-subquery2:b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:c-subquery2:b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-10 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:c-subquery2:a + TableScan + alias: a + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col5 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery1:c-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-7 + Conditional Operator + + Stage: Stage-4 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Block level merge + + Stage: Stage-5 + Block level merge + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-14 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:c-subquery2:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:c-subquery2:a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-11 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:c-subquery2:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col5 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:c-subquery2:a + TableScan + alias: a + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + null-subquery2:c-subquery2:b + TableScan + alias: b + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc 
formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), ] +1 11 +2 12 +3 13 +7 17 +8 18 +8 28 +8 18 +8 28 +1 1 +2 1 +3 1 +7 1 +8 1 +8 1 Index: ql/src/test/results/clientpositive/union_remove_7.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_7.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_7.q.out (working copy) @@ -0,0 +1,267 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1, Stage-2 + Stage-2 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_2.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_2.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_2.q.out (working copy) @@ -0,0 +1,267 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values))))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1, Stage-2, Stage-3 + Stage-2 is a root stage + Stage-3 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery1:a-subquery1-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) 
+ bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery2:a-subquery1-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: 
-1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +1 2 +2 2 +3 2 +7 2 +8 2 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 1 +8 1 Index: ql/src/test/results/clientpositive/union_remove_16.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_16.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_16.q.out (working copy) @@ -0,0 +1,333 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on +-- Currently, this optimization does not work in the presence of dynamic partitions. +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on +-- Currently, this optimization does not work in the presence of dynamic partitions.
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values) (TOK_SELEXPR '1' ds)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values) (TOK_SELEXPR '2' ds)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1) (TOK_PARTSPEC (TOK_PARTVAL ds)))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-6 depends on stages: Stage-1, Stage-7 , consists of Stage-3, Stage-2, Stage-4 + Stage-3 + Stage-0 depends on stages: Stage-3, Stage-2, Stage-5 + Stage-2 + Stage-4 + Stage-5 depends on stages: Stage-4 + Stage-7 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: '2' + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: 
org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-6 + Conditional Operator + + Stage: Stage-3 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Block level merge + + Stage: Stage-4 + Block level merge + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-7 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: '1' + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=1 +POSTHOOK: Output: default@outputtbl1@ds=2 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc 
formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Partition Information +# col_name data_type comment + +ds string None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions outputTbl1 +PREHOOK: type: SHOWPARTITIONS +POSTHOOK: query: show partitions outputTbl1 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +ds=1 +ds=2 +PREHOOK: query: select * from outputTbl1 where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 1 +2 1 1 +3 1 1 +7 1 1 +8 2 1 +PREHOOK: query: select * from outputTbl1 where ds = '2' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '2' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=2 +#### A masked 
pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 2 +2 1 2 +3 1 2 +7 1 2 +8 2 2 Index: ql/src/test/results/clientpositive/union_remove_11.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_11.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_11.q.out (working copy) @@ -0,0 +1,323 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union which also contains map-only sub-queries), +-- followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union which also contains map-only sub-queries), +-- followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not.
In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 3 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))) b)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-6 depends on stages: Stage-1 , consists of Stage-3, Stage-2, Stage-4 + Stage-3 + Stage-0 depends on stages: Stage-3, Stage-2, Stage-5 + Stage-2 + Stage-4 + Stage-5 depends on stages: Stage-4 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:b-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery2:b-subquery2-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 
2 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery2:b-subquery2-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 3 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-6 + Conditional Operator + + Stage: Stage-3 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Block level merge + + Stage: Stage-4 + Block level merge + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, 
comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +1 2 +1 1 +1 3 +2 2 +2 1 +2 3 +3 2 +3 1 +3 3 +7 2 +7 1 +7 3 +8 2 +8 1 +8 3 +8 2 +8 1 +8 3 Index: ql/src/test/results/clientpositive/union_remove_9.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_9.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_9.q.out (working copy) @@ -0,0 +1,314 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which contains a union and is map-only), +-- and the other one is a map-reduce query followed by select star and a file sink. +-- There is no need for the outer union. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which contains a union and is map-only), +-- and the other one is a map-reduce query followed by select star and a file sink. +-- There is no need for the outer union. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))) b)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-6 depends on stages: Stage-1, Stage-7 , consists of Stage-3, Stage-2, Stage-4 + Stage-3 + Stage-0 depends on stages: Stage-3, Stage-2, Stage-5 + Stage-2 + Stage-4 + Stage-5 depends on stages: Stage-4 + Stage-7 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:b-subquery2-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery2:b-subquery2-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: 
key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-6 + Conditional Operator + + Stage: Stage-3 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Block level merge + + Stage: Stage-4 + Block level merge + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-7 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:b-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION 
[(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +1 1 +1 2 +2 1 +2 2 +3 1 +3 2 +7 1 +7 2 +8 1 +8 2 +8 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_4.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_4.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_4.q.out (working copy) @@ -0,0 +1,309 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-6 depends on stages: Stage-1, Stage-7 , consists of Stage-3, Stage-2, Stage-4 + Stage-3 + Stage-0 depends on stages: Stage-3, Stage-2, Stage-5 + Stage-2 + Stage-4 + Stage-5 depends on stages: Stage-4 + Stage-7 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: 
default.outputtbl1 + + Stage: Stage-6 + Conditional Operator + + Stage: Stage-3 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-7 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted 
outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_18.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_18.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_18.q.out (working copy) @@ -0,0 +1,401 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, ds string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off + +create table inputTbl1(key string, ds string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds + UNION ALL + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds + UNION ALL + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values) (TOK_SELEXPR (TOK_TABLE_OR_COL ds))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL ds)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values) (TOK_SELEXPR (TOK_TABLE_OR_COL ds))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL ds))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1) (TOK_PARTSPEC (TOK_PARTVAL ds)))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1, Stage-2 + Stage-2 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: ds + type: string + outputColumnNames: key, ds + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: ds + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + expr: _col1 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: bigint + expr: _col1 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: ds + type: string + outputColumnNames: key, ds + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + expr: ds + type: string + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: string + expr: _col1 + type: string + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + expr: KEY._col1 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col2 + type: bigint + expr: _col1 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds + UNION ALL + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds + UNION ALL + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=11 +POSTHOOK: Output: default@outputtbl1@ds=12 +POSTHOOK: Output: default@outputtbl1@ds=13 +POSTHOOK: Output: default@outputtbl1@ds=17 +POSTHOOK: Output: default@outputtbl1@ds=18 +POSTHOOK: Output: default@outputtbl1@ds=28 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), 
(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Partition Information +# col_name data_type comment + +ds string None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### 
+Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions outputTbl1 +PREHOOK: type: SHOWPARTITIONS +POSTHOOK: query: show partitions outputTbl1 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +ds=11 +ds=12 +ds=13 +ds=17 +ds=18 +ds=28 +PREHOOK: query: select * from outputTbl1 where ds = '11' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=11 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '11' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=11 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: 
outputtbl1 PARTITION(ds=12).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 11 +1 1 11 +PREHOOK: query: select * from outputTbl1 where ds = '18' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=18 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '18' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=18 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), 
(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +8 1 18 +8 1 18 +PREHOOK: query: select * from outputTbl1 where ds is not null +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=11 +PREHOOK: Input: default@outputtbl1@ds=12 +PREHOOK: Input: default@outputtbl1@ds=13 +PREHOOK: Input: default@outputtbl1@ds=17 +PREHOOK: Input: default@outputtbl1@ds=18 +PREHOOK: Input: default@outputtbl1@ds=28 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds is not null +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=11 +POSTHOOK: Input: default@outputtbl1@ds=12 +POSTHOOK: Input: default@outputtbl1@ds=13 +POSTHOOK: Input: default@outputtbl1@ds=17 +POSTHOOK: Input: default@outputtbl1@ds=18 +POSTHOOK: Input: default@outputtbl1@ds=28 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 11 +1 1 11 +2 1 12 +2 1 12 +3 1 13 +3 1 13 +7 1 17 +7 1 17 +8 1 18 +8 1 18 +8 1 28 +8 1 28 Index: ql/src/test/results/clientpositive/union_remove_13.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_13.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_13.q.out (working copy) @@ -0,0 +1,371 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a mapred query, and the +-- other one is a map-join query), 
followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed. + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a mapred query, and the +-- other one is a map-join query), followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed. + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME inputTbl1) a) (TOK_TABREF (TOK_TABNAME inputTbl1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) val) values))))) c)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-10 is a root stage + Stage-3 depends on stages: Stage-2, Stage-10 + Stage-8 depends on stages: Stage-3 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-0 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + Stage-11 is a root stage + Stage-1 depends on stages: Stage-11 + Stage-2 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-10 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:c-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-4 + Block level merge + + Stage: Stage-6 + Block level merge + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked 
pattern was here #### + + Stage: Stage-11 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:c-subquery2:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:c-subquery2:a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:c-subquery2:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col5 + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: 
org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), (inputtbl1)inputtbl1.null, ] +1 11 +2 12 +3 13 +7 17 +8 18 +8 18 +8 28 +8 28 +1 NULL +2 NULL +3 NULL +7 NULL +8 NULL Index: ql/src/test/results/clientpositive/union_remove_6.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_6.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_6.q.out (working copy) @@ -0,0 +1,337 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (all of which are mapred queries) +-- followed by select star and a file sink in 2 output tables. +-- The optimization does not take effect since it is a multi-table insert. +-- It does not matter, whether the output is merged or not. In this case, +-- merging is turned off + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (all of which are mapred queries) +-- followed by select star and a file sink in 2 output tables. +-- The optimization does not take effect since it is a multi-table insert. +-- It does not matter, whether the output is merged or not.
In this case, +-- merging is turned off + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: create table outputTbl2(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl2(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl2 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +PREHOOK: type: QUERY +POSTHOOK: query: explain +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl2))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-3 depends on stages: Stage-2, Stage-4 + Stage-0 depends on stages: Stage-3 + Stage-1 depends on stages: Stage-3 + Stage-4 is a root stage + +STAGE PLANS: + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 2 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 2 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-1 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + +PREHOOK: query: FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +PREHOOK: Output: default@outputtbl2 +POSTHOOK: query: FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Output: default@outputtbl2 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl2.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl2.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl2.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl2.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: select * from outputTbl2 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl2.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl2.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_1.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_1.q.out (revision 0) +++ 
ql/src/test/results/clientpositive/union_remove_1.q.out (working copy) @@ -0,0 +1,263 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1, Stage-2 + Stage-2 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key 
+ type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: 
type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_15.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_15.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_15.q.out (working copy) @@ -0,0 +1,309 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- Currently, this optimization does not work in the presence of dynamic partitions. + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- Currently, this optimization does not work in the presence of dynamic partitions.
+ +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values) (TOK_SELEXPR '1' ds)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values) (TOK_SELEXPR '2' ds)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1) (TOK_PARTSPEC (TOK_PARTVAL ds)))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1, Stage-2 + Stage-2 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: '2' + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: 
org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: '1' + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=1 +POSTHOOK: Output: default@outputtbl1@ds=2 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION 
[(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Partition Information +# col_name data_type comment + +ds string None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions outputTbl1 +PREHOOK: type: SHOWPARTITIONS +POSTHOOK: query: show partitions outputTbl1 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +ds=1 +ds=2 +PREHOOK: query: select * from outputTbl1 where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 1 +2 1 1 +3 1 1 +7 1 1 +8 2 1 +PREHOOK: query: select * from outputTbl1 where ds = '2' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '2' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION 
[(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 2 +2 1 2 +3 1 2 +7 1 2 +8 2 2 Index: ql/src/test/results/clientpositive/union_remove_10.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_10.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_10.q.out (working copy) @@ -0,0 +1,331 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union where one of the sub-queries requires a map-reduce +-- job), followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The outer union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union where one of the sub-queries requires a map-reduce +-- job), followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The outer union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))) b)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-7 depends on stages: Stage-2, Stage-8 , consists of Stage-4, Stage-3, Stage-5 + Stage-4 + Stage-0 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-3 + Stage-5 + Stage-6 depends on stages: Stage-5 + Stage-8 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:b-subquery2-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + 
outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery2:b-subquery2-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-7 + Conditional Operator + + Stage: Stage-4 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Block level merge + + Stage: Stage-5 + Block level merge + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:b-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), 
(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 1 +8 1 +1 1 +2 1 +3 1 +7 1 +8 2 +1 2 +2 2 +3 2 +7 2 +8 2 +8 2 Index: ql/src/test/results/clientpositive/union_remove_8.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_8.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_8.q.out (working copy) @@ -0,0 +1,271 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. 
The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values))))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1, Stage-2, Stage-3 + Stage-2 is a root stage + Stage-3 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + 
null-subquery1-subquery1:a-subquery1-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery2:a-subquery1-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe 
Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +1 2 +2 2 +3 2 +7 2 +8 2 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 1 +8 1 Index: ql/src/test/queries/clientpositive/union_remove_11.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_11.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_11.q (working copy) @@ -0,0 +1,55 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union that also contains map-only sub-queries), +-- followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not.
In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_6.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_6.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_6.q (working copy) @@ -0,0 +1,39 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (both of which are mapred queries) +-- followed by select star and a file sink in 2 output tables. +-- The optimization does not take effect since it is a multi-table insert. +-- It does not matter, whether the output is merged or not. In this case, +-- merging is turned off + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; +create table outputTbl2(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select *; + +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select *; + +select * from outputTbl1; +select * from outputTbl2; Index: ql/src/test/queries/clientpositive/union_remove_13.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_13.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_13.q (working copy) @@ -0,0 +1,48 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a mapred query, and the +-- other one is a map-join query), followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed.
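+-- (Illustrative sketch, not output produced by this test: with the union removed, each
+-- branch writes its files into its own sub-directory under the final table location,
+-- e.g. something like
+--   .../outputtbl1/<branch-1-dir>/000000_0  -- files from the group-by branch
+--   .../outputtbl1/<branch-2-dir>/000000_0  -- files from the map-join branch
+-- which is why mapred.input.dir.recursive=true is set above; the directory names
+-- here are hypothetical.)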
+ +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_8.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_8.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_8.q (working copy) @@ -0,0 +1,49 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_15.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_15.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_15.q (working copy) @@ -0,0 +1,51 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.exec.dynamic.partition=true; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- Currently, this optimization does not work in the presence of dynamic partitions.
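+-- (Illustrative note: the rows of this test land under partition directories such as
+-- .../outputtbl1/ds=1/ and .../outputtbl1/ds=2/, as exercised by the select statements
+-- at the end of this file; with the rewrite, every union branch would have to commit
+-- files into those same partition directories, which is presumably why the optimization
+-- is skipped for dynamic partitions for now.)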
+ +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a; + +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a; + +desc formatted outputTbl1; + +show partitions outputTbl1; + +select * from outputTbl1 where ds = '1'; +select * from outputTbl1 where ds = '2'; Index: ql/src/test/queries/clientpositive/union_remove_17.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_17.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_17.q (working copy) @@ -0,0 +1,47 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.exec.dynamic.partition=true; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-only subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need for this optimization, since the query is a map-only query. +-- It does not matter, whether the output is merged or not.
In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a; + +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a; + +desc formatted outputTbl1; +show partitions outputTbl1; + +select * from outputTbl1 where ds = '1'; +select * from outputTbl1 where ds = '2'; Index: ql/src/test/queries/clientpositive/union_remove_1.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_1.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_1.q (working copy) @@ -0,0 +1,42 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_3.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_3.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_3.q (working copy) @@ -0,0 +1,47 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (all of which are map-only queries) +-- followed by select star and a file sink. +-- There is no need for any optimization, since the whole query can be processed in +-- a single map-only job +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_10.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_10.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_10.q (working copy) @@ -0,0 +1,55 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union where one of the sub-queries requires a map-reduce +-- job), followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The outer union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_5.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_5.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_5.q (working copy) @@ -0,0 +1,49 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_12.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_12.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_12.q (working copy) @@ -0,0 +1,49 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one is a map-join query), followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_7.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_7.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_7.q (working copy) @@ -0,0 +1,44 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_14.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_14.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_14.q (working copy) @@ -0,0 +1,50 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.auto.convert.join=true; +set hive.merge.smallfiles.avgsize=1; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a join, which should be performed as a map-join query at runtime), +-- followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed. 
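+-- (What to look for in the explain output below, as in the other union_remove tests:
+-- after the rewrite, each branch should end in a File Output Operator with
+-- name: default.outputtbl1, rather than feeding a common Union operator first.)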
+ +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_9.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_9.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_9.q (working copy) @@ -0,0 +1,53 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which contains a union and is map-only, +-- and the other one is a map-reduce query), followed by select star and a file sink. +-- There is no need for the outer union. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not.
In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_16.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_16.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_16.q (working copy) @@ -0,0 +1,50 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; +set mapred.input.dir.recursive=true; + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.exec.dynamic.partition=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on +-- Currently, this optimization does not work in the presence of dynamic partitions.
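+-- (Note: this test is intended as the merge-enabled counterpart of union_remove_15.q;
+-- the relevant difference is hive.merge.mapfiles/hive.merge.mapredfiles=true together
+-- with hive.merge.smallfiles.avgsize=1 above.)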
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a; + +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a; + +desc formatted outputTbl1; +show partitions outputTbl1; + +select * from outputTbl1 where ds = '1'; +select * from outputTbl1 where ds = '2'; Index: ql/src/test/queries/clientpositive/union_remove_18.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_18.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_18.q (working copy) @@ -0,0 +1,45 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.exec.dynamic.partition=true; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not.
In this case, merging is turned +-- off + +create table inputTbl1(key string, ds string) stored as textfile; +create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds + UNION ALL + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds +) a; + +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds + UNION ALL + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds +) a; + +desc formatted outputTbl1; + +show partitions outputTbl1; +select * from outputTbl1 where ds = '11'; +select * from outputTbl1 where ds = '18'; +select * from outputTbl1 where ds is not null; Index: ql/src/test/queries/clientpositive/union_remove_2.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_2.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_2.q (working copy) @@ -0,0 +1,47 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set mapred.input.dir.recursive=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_4.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_4.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_4.q (working copy) @@ -0,0 +1,43 @@ +set hive.stats.autogather=false; +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set mapred.input.dir.recursive=true; +set hive.merge.smallfiles.avgsize=1; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcFactory.java (revision 1379405) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcFactory.java (working copy) @@ -17,16 +17,23 @@ */ package org.apache.hadoop.hive.ql.optimizer.unionproc; +import java.util.ArrayList; import java.util.List; import java.util.Stack; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorFactory; +import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.UnionOperator; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext.UnionParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.FileSinkDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; /** @@ -173,6 +180,86 @@ } /** + * Union followed by no processing. + * This is to optimize queries of the type: + * select * from (subq1 union all subq2 ...)x; + * where at least one of the queries involves a map-reduce job. + * There is no need for the union in this scenario; without this optimization, + * it costs an extra write and read of the final output. + */ + public static class UnionNoProcessFile implements NodeProcessor { + + @Override + public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, + Object...
nodeOutputs) throws SemanticException { + FileSinkOperator fileSinkOp = (FileSinkOperator)nd; + + // Has this file sink already been processed? + if (fileSinkOp.getConf().isLinkedFileSink()) { + return null; + } + + int size = stack.size(); + SelectOperator select = (SelectOperator)stack.get(size-2); + + // If it is not a select star, don't do anything + SelectDesc selectDesc = select.getConf(); + if (!selectDesc.isSelectStar()) { + return null; + } + + UnionOperator union = (UnionOperator)stack.get(size-3); + + // No need for this optimization in case of multi-table inserts + if (union.getChildOperators().size() > 1) { + return null; + } + + UnionProcContext ctx = (UnionProcContext) procCtx; + UnionParseContext uCtx = ctx.getUnionParseContext(union); + + // No need for this if all sub-queries are map-only queries + if (uCtx.allMapOnlySubQ()) { + return null; + } + + String parentDirName = fileSinkOp.getConf().getDirName(); + + // Clone the fileSinkDesc of the final fileSink and create similar fileSinks at + // each parent + List<FileSinkDesc> fileDescLists = new ArrayList<FileSinkDesc>(); + + for (Operator<? extends OperatorDesc> parent : union.getParentOperators()) { + FileSinkDesc fileSinkDesc = null; + try { + fileSinkDesc = (FileSinkDesc) fileSinkOp.getConf().clone(); + } catch (CloneNotSupportedException e) { + return null; + } + + String dirName = parentDirName + Path.SEPARATOR + parent.getIdentifier(); + fileSinkDesc.setDirName(dirName); + fileSinkDesc.setLinkedFileSink(true); + fileSinkDesc.setParentDir(parentDirName); + // Detach the union: the new file sink becomes the only child of this parent + parent.setChildOperators(null); + Operator<? extends OperatorDesc> tmpFileSinkOp = + OperatorFactory.getAndMakeChild(fileSinkDesc, parent.getSchema(), parent); + tmpFileSinkOp.setChildOperators(null); + fileDescLists.add(fileSinkDesc); + } + + for (FileSinkDesc fileDesc : fileDescLists) { + fileDesc.setLinkedFileSinkDesc(fileDescLists); + } + + // Delink the union as if it were not present in the first place. + union.setParentOperators(null); + return null; + } + } + + /** * Default processor. */ public static class NoUnion implements NodeProcessor { @@ -204,4 +291,7 @@ return new NoUnion(); } + public static NodeProcessor getUnionNoProcessFile() { + return new UnionNoProcessFile(); + } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcessor.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcessor.java (revision 1379405) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcessor.java (working copy) @@ -22,6 +22,7 @@ import java.util.LinkedHashMap; import java.util.Map; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; import org.apache.hadoop.hive.ql.lib.GraphWalker; @@ -64,13 +65,18 @@ // the operator stack.
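+ // Summary of the change below: a union followed immediately by a select star and a + // file sink needs no processing of its own, so rule R5 matches the operator-stack + // pattern UNION%SEL%FS% and fires UnionProcFactory.getUnionNoProcessFile(), guarded + // by hive.optimize.union.selectstar.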
Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); opRules.put(new RuleRegExp("R1", "RS%.*UNION%"), - UnionProcFactory.getMapRedUnion()); + UnionProcFactory.getMapRedUnion()); opRules.put(new RuleRegExp("R2", "UNION%.*UNION%"), - UnionProcFactory.getUnknownUnion()); + UnionProcFactory.getUnknownUnion()); opRules.put(new RuleRegExp("R3", "TS%.*UNION%"), - UnionProcFactory.getMapUnion()); - opRules.put(new RuleRegExp("R3", "MAPJOIN%.*UNION%"), - UnionProcFactory.getMapJoinUnion()); + UnionProcFactory.getMapUnion()); + opRules.put(new RuleRegExp("R4", "MAPJOIN%.*UNION%"), + UnionProcFactory.getMapJoinUnion()); + HiveConf conf = pCtx.getConf(); + if (conf.getBoolVar(HiveConf.ConfVars.HIVEOPTIMIZEUNIONSELECTSTAR)) { + opRules.put(new RuleRegExp("R5", "UNION%SEL%FS%"), + UnionProcFactory.getUnionNoProcessFile()); + } // The dispatcher fires the processor for the matching rule and passes the // context along Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRProcContext.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRProcContext.java (revision 1379405) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRProcContext.java (working copy) @@ -23,6 +23,7 @@ import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Set; import org.apache.hadoop.hive.conf.HiveConf; @@ -38,6 +39,7 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.plan.DependencyCollectionWork; +import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.MoveWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; @@ -253,6 +255,10 @@ private List<Operator<? extends OperatorDesc>> rootOps; private DependencyCollectionTask dependencyTaskForMultiInsert; + // When multiple fileSinkDescs are linked to each other, keep track of the task + // created for the first fileSinkDesc; the others can reuse it + private Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks; + /** * Set of read entities. This list is generated by the walker and is passed to * the hooks.
@@ -314,6 +320,7 @@ unionTaskMap = new HashMap<UnionOperator, GenMRUnionCtx>(); mapJoinTaskMap = new HashMap<AbstractMapJoinOperator<? extends MapJoinDesc>, GenMRMapJoinCtx>(); dependencyTaskForMultiInsert = null; + linkedFileDescTasks = null; } /** @@ -568,4 +575,13 @@ } return dependencyTaskForMultiInsert; } + + public Map<FileSinkDesc, Task<? extends Serializable>> getLinkedFileDescTasks() { + return linkedFileDescTasks; + } + + public void setLinkedFileDescTasks( + Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks) { + this.linkedFileDescTasks = linkedFileDescTasks; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java (revision 1379405) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java (working copy) @@ -23,6 +23,7 @@ import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Stack; import org.apache.commons.logging.Log; @@ -109,6 +110,16 @@ parseCtx.getQB().getParseInfo().isInsertToTable(); HiveConf hconf = parseCtx.getConf(); + // If this file sink desc has been processed due to a linked file sink desc, + // use that task + Map<FileSinkDesc, Task<? extends Serializable>> fileSinkDescs = ctx.getLinkedFileDescTasks(); + if (fileSinkDescs != null) { + Task<? extends Serializable> childTask = fileSinkDescs.get(fsOp.getConf()); + if (childTask != null) { + processLinkedFileDesc(ctx, childTask); + return null; + } + } // Has the user enabled merging of files for map-only jobs or for all jobs if ((ctx.getMvTask() != null) && (!ctx.getMvTask().isEmpty())) { @@ -129,19 +140,27 @@ } if ((mvTask != null) && !mvTask.isLocal()) { - // There are separate configuration parameters to control whether to - // merge for a map-only job - // or for a map-reduce job - MapredWork currWork = (MapredWork) currTask.getWork(); - boolean mergeMapOnly = - hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPFILES) && - currWork.getReducer() == null; - boolean mergeMapRed = - hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPREDFILES) && - currWork.getReducer() != null; - if (mergeMapOnly || mergeMapRed) { - chDir = true; + if (fsOp.getConf().isLinkedFileSink()) { + if (hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPFILES) || + hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPREDFILES)) { + chDir = true; + } } + else { + // There are separate configuration parameters to control whether to + // merge for a map-only job + // or for a map-reduce job + MapredWork currWork = (MapredWork) currTask.getWork(); + boolean mergeMapOnly = + hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPFILES) && + currWork.getReducer() == null; + boolean mergeMapRed = + hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPREDFILES) && + currWork.getReducer() != null; + if (mergeMapOnly || mergeMapRed) { + chDir = true; + } + } } } } @@ -153,9 +172,54 @@ createMergeJob((FileSinkOperator) nd, ctx, finalName); } + FileSinkDesc fileSinkDesc = fsOp.getConf(); + if (fileSinkDesc.isLinkedFileSink()) { + Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks = + ctx.getLinkedFileDescTasks(); + if (linkedFileDescTasks == null) { + linkedFileDescTasks = new HashMap<FileSinkDesc, Task<? extends Serializable>>(); + ctx.setLinkedFileDescTasks(linkedFileDescTasks); + } + assert currTask.getChildTasks() != null; + assert currTask.getChildTasks().size() == 1; + + for (FileSinkDesc fileDesc : fileSinkDesc.getLinkedFileSinkDesc()) { + linkedFileDescTasks.put(fileDesc, currTask.getChildTasks().get(0)); + } + + } + return null; } + /* + * Multiple file sink descriptors are linked.
+ * Use the task created by the first linked file descriptor + */ + private void processLinkedFileDesc(GenMRProcContext ctx, + Task<? extends Serializable> childTask) + throws SemanticException { + Operator<? extends OperatorDesc> currTopOp = ctx.getCurrTopOp(); + String currAliasId = ctx.getCurrAliasId(); + List<Operator<? extends OperatorDesc>> seenOps = ctx.getSeenOps(); + List<Task<? extends Serializable>> rootTasks = ctx.getRootTasks(); + Task<? extends Serializable> currTask = ctx.getCurrTask(); + + if (currTopOp != null) { + if (!seenOps.contains(currTopOp)) { + seenOps.add(currTopOp); + GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, + (MapredWork) currTask.getWork(), false, ctx); + } + + if (!rootTasks.contains(currTask)) { + rootTasks.add(currTask); + } + } + + currTask.addDependentTask(childTask); + } + /** * Add the StatsTask as a dependent task of the MoveTask * because StatsTask will change the Table/Partition metadata. For atomicity, we @@ -262,10 +326,10 @@ // since it is unknown if the merge MR will be triggered at execution time. MoveWork dummyMv = new MoveWork(null, null, null, - new LoadFileDesc(fsConf.getDirName(), finalName, true, null, null), false); + new LoadFileDesc(fsConf.getFinalDirName(), finalName, true, null, null), false); ConditionalTask cndTsk = createCondTask(conf, currTask, dummyMv, cplan, - fsConf.getDirName()); + fsConf.getFinalDirName()); linkMoveTask(ctx, newOutput, cndTsk); } @@ -387,7 +451,7 @@ // MapRedTask currTask = (MapRedTask) ctx.getCurrTask(); MoveWork dummyMv = new MoveWork(null, null, null, - new LoadFileDesc(fsInputDesc.getDirName(), finalName, true, null, null), false); + new LoadFileDesc(fsInputDesc.getFinalDirName(), finalName, true, null, null), false); MapredWork cplan; if(parseCtx.getConf().getBoolVar(HiveConf.ConfVars. @@ -417,7 +481,7 @@ // NOTE: we should gather stats in MR1 rather than MR2 at merge job since we don't // know if merge MR2 will be triggered at execution time ConditionalTask cndTsk = createCondTask(ctx.getConf(), ctx.getCurrTask(), dummyMv, cplan, - fsInputDesc.getDirName()); + fsInputDesc.getFinalDirName()); // keep the dynamic partition context in conditional task resolver context ConditionalResolverMergeFilesCtx mrCtx = @@ -516,7 +580,7 @@ FileSinkDesc fsDesc) { ArrayList<String> aliases = new ArrayList<String>(); - String inputDir = fsDesc.getDirName(); + String inputDir = fsDesc.getFinalDirName(); TableDesc tblDesc = fsDesc.getTableInfo(); aliases.add(inputDir); // dummy alias: just use the input path @@ -541,7 +605,7 @@ private MapredWork createRCFileMergeTask(FileSinkDesc fsInputDesc, String finalName, boolean hasDynamicPartitions) throws SemanticException { - String inputDir = fsInputDesc.getDirName(); + String inputDir = fsInputDesc.getFinalDirName(); TableDesc tblDesc = fsInputDesc.getTableInfo(); if(tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) { @@ -637,8 +701,9 @@ srcDir = mvWork.getLoadTableWork().getSourceDir(); } + String fsOpDirName = fsOp.getConf().getFinalDirName(); if ((srcDir != null) - && (srcDir.equalsIgnoreCase(fsOp.getConf().getDirName()))) { + && (srcDir.equalsIgnoreCase(fsOpDirName))) { return mvTsk; } } @@ -681,7 +746,7 @@ String dest = null; if (chDir) { - dest = fsOp.getConf().getDirName(); + dest = fsOp.getConf().getFinalDirName(); // generate the temporary file // it must be on the same file system as the current destination @@ -689,7 +754,17 @@ Context baseCtx = parseCtx.getContext(); String tmpDir = baseCtx.getExternalTmpFileURI((new Path(dest)).toUri()); - fsOp.getConf().setDirName(tmpDir); + FileSinkDesc fileSinkDesc = fsOp.getConf(); + // Change all the linked file sink descriptors + if
(fileSinkDesc.isLinkedFileSink()) { + for (FileSinkDesc fsConf:fileSinkDesc.getLinkedFileSinkDesc()) { + String fileName = Utilities.getFileNameFromDirName(fsConf.getDirName()); + fsConf.setParentDir(tmpDir); + fsConf.setDirName(tmpDir + Path.SEPARATOR + fileName); + } + } else { + fileSinkDesc.setDirName(tmpDir); + } } Task mvTask = null; @@ -707,7 +782,6 @@ // Set the move task to be dependent on the current task if (mvTask != null) { - addDependentMoveTasks(ctx, mvTask, currTask); } @@ -742,7 +816,7 @@ } } // mapTask and currTask should be merged by and join/union operator - // (e.g., GenMRUnion1j) which has multiple topOps. + // (e.g., GenMRUnion1) which has multiple topOps. // assert mapTask == currTask : "mapTask.id = " + mapTask.getId() // + "; currTask.id = " + currTask.getId(); } Index: ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java (revision 1379405) +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java (working copy) @@ -31,13 +31,13 @@ import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; -import java.util.HashSet; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -63,7 +63,6 @@ import org.apache.hadoop.hive.metastore.api.HiveObjectRef; import org.apache.hadoop.hive.metastore.api.HiveObjectType; import org.apache.hadoop.hive.metastore.api.Index; -import org.apache.hadoop.hive.metastore.api.InvalidObjectException; import org.apache.hadoop.hive.metastore.api.InvalidOperationException; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; @@ -1194,12 +1193,7 @@ FileStatus[] leafStatus = Utilities.getFileStatusRecurse(loadPath, numDP+1, fs); // Check for empty partitions for (FileStatus s : leafStatus) { - if (s.isDir()) { - // No leaves in this directory - LOG.info("NOT moving empty directory: " + s.getPath()); - } else { - validPartitions.add(s.getPath().getParent()); - } + validPartitions.add(s.getPath().getParent()); } if (validPartitions.size() == 0) { @@ -1868,10 +1862,7 @@ fs.delete(itemStaging, true); continue; } - if (item.isDir()) { - throw new HiveException("checkPaths: " + src.getPath() - + " has nested directory" + itemStaging); - } + if (!replace) { // It's possible that the file we're copying may have the same // relative name as an existing file in the "destf" directory. Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java (revision 1379405) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java (working copy) @@ -77,6 +77,7 @@ protected transient Path parent; protected transient HiveOutputFormat hiveOutputFormat; protected transient Path specPath; + protected transient String childSpecPathDynLinkedPartitions; protected transient int dpStartCol; // start column # for DP columns protected transient List dpVals; // array of values corresponding to DP columns protected transient List dpWritables; @@ -119,16 +120,6 @@ } /** - * Append a subdirectory to the tmp path. 
- * - * @param dp - * subdirecgtory name - */ - public void appendTmpPath(String dp) { - tmpPath = new Path(tmpPath, dp); - } - - /** * Update OutPath according to tmpPath. */ public Path getTaskOutPath(String taskId) { @@ -282,6 +273,30 @@ private boolean filesCreated = false; + private void initializeSpecPath() { + // For a query of the type: + // insert overwrite table T1 + // select * from (subq1 union all subq2)u; + // subQ1 and subQ2 write to directories Parent/Child_1 and + // Parent/Child_2 respectively, and the union is removed. + // The move task that follows the subQ1 and subQ2 tasks moves the directory + // 'Parent' + + // However, if the above query contains dynamic partitions, subQ1 and + // subQ2 have to write to directories: Parent/DynamicPartition/Child_1 + // and Parent/DynamicPartition/Child_2 respectively. + // The move task that follows the subQ1 and subQ2 tasks still moves the directory + // 'Parent' + if ((!conf.isLinkedFileSink()) || (dpCtx == null)) { + specPath = new Path(conf.getDirName()); + childSpecPathDynLinkedPartitions = null; + return; + } + + specPath = new Path(conf.getParentDir()); + childSpecPathDynLinkedPartitions = Utilities.getFileNameFromDirName(conf.getDirName()); + } + @Override protected void initializeOp(Configuration hconf) throws HiveException { try { @@ -294,7 +309,7 @@ dpCtx = conf.getDynPartCtx(); valToPaths = new HashMap<String, FSPaths>(); taskId = Utilities.getTaskId(hconf); - specPath = new Path(conf.getDirName()); + initializeSpecPath(); fs = specPath.getFileSystem(hconf); hiveOutputFormat = conf.getTableInfo().getOutputFileFormatClass().newInstance(); isCompressed = conf.getCompressed(); @@ -625,8 +640,17 @@ LOG.error("Fatal error was thrown due to exceeding number of dynamic partitions"); } fsp2 = new FSPaths(specPath); - fsp2.tmpPath = new Path(fsp2.tmpPath, dpDir); - fsp2.taskOutputTempPath = new Path(fsp2.taskOutputTempPath, dpDir); + if (childSpecPathDynLinkedPartitions != null) { + fsp2.tmpPath = new Path(fsp2.tmpPath, + dpDir + Path.SEPARATOR + childSpecPathDynLinkedPartitions); + fsp2.taskOutputTempPath = + new Path(fsp2.taskOutputTempPath, + dpDir + Path.SEPARATOR + childSpecPathDynLinkedPartitions); + } else { + fsp2.tmpPath = new Path(fsp2.tmpPath, dpDir); + fsp2.taskOutputTempPath = + new Path(fsp2.taskOutputTempPath, dpDir); + } createBucketFiles(fsp2); valToPaths.put(dpDir, fsp2); } @@ -668,7 +692,11 @@ DynamicPartitionCtx dpCtx = conf.getDynPartCtx(); int numDP = dpCtx.getNumDPCols(); FileSystem fs = tmpPath.getFileSystem(jobConf); - FileStatus[] status = Utilities.getFileStatusRecurse(tmpPath, numDP, fs); + int level = numDP; + if (conf.isLinkedFileSink()) { + level++; + } + FileStatus[] status = Utilities.getFileStatusRecurse(tmpPath, level, fs); sb.append("Sample of ") .append(Math.min(status.length, 100)) .append(" partitions created under ") @@ -746,6 +774,9 @@ if ((conf != null) && isNativeTable) { String specPath = conf.getDirName(); DynamicPartitionCtx dpCtx = conf.getDynPartCtx(); + if (conf.isLinkedFileSink() && (dpCtx != null)) { + specPath = conf.getParentDir(); + } Utilities.mvFileToFinalPath(specPath, hconf, success, LOG, dpCtx, conf); } } catch (IOException e) { Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (revision 1379405) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java (working copy) @@ -1135,18 +1135,25 @@ // move file by file FileStatus[] files = fs.listStatus(src); for
(FileStatus file : files) { + Path srcFilePath = file.getPath(); String fileName = srcFilePath.getName(); Path dstFilePath = new Path(dst, fileName); - if (fs.exists(dstFilePath)) { - int suffix = 0; - do { - suffix++; - dstFilePath = new Path(dst, fileName + "_" + suffix); - } while (fs.exists(dstFilePath)); + if (file.isDir()) { + renameOrMoveFiles(fs, srcFilePath, dstFilePath); } - if (!fs.rename(srcFilePath, dstFilePath)) { - throw new HiveException("Unable to move: " + src + " to: " + dst); + else { + if (fs.exists(dstFilePath)) { + int suffix = 0; + do { + suffix++; + dstFilePath = new Path(dst, fileName + "_" + suffix); + } while (fs.exists(dstFilePath)); + } + + if (!fs.rename(srcFilePath, dstFilePath)) { + throw new HiveException("Unable to move: " + src + " to: " + dst); + } } } } @@ -1206,6 +1213,14 @@ return taskId; } + public static String getFileNameFromDirName(String dirName) { + int dirEnd = dirName.lastIndexOf(Path.SEPARATOR); + if (dirEnd != -1) { + return dirName.substring(dirEnd + 1); + } + return dirName; + } + /** * Replace the task id from the filename. It is assumed that the filename is derived from the * output of getTaskId Index: ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java (revision 1379405) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java (working copy) @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.plan; import java.util.ArrayList; +import java.util.List; import org.apache.hadoop.fs.Path; @@ -45,6 +46,19 @@ private DynamicPartitionCtx dpCtx; private String staticSpec; // static partition spec ends with a '/' private boolean gatherStats; + + // This file sink descriptor may be linked to other file sink descriptors. + // One use case is when a union->select(star)->file sink chain is broken up. + // For example, consider a query like: + // select * from (subq1 union all subq2)x; + // where subq1 or subq2 involves a map-reduce job. + // It is broken into two independent queries involving subq1 and subq2 directly, and + // the sub-queries write to sub-directories of a common directory. So, the file sink + // descriptors for subq1 and subq2 are linked. + private boolean linkedFileSink = false; + private String parentDir; + private transient List<FileSinkDesc> linkedFileSinkDesc; + private boolean statsReliable; public FileSinkDesc() { @@ -79,6 +93,22 @@ this.partitionCols = null; } + @Override + public Object clone() throws CloneNotSupportedException { + FileSinkDesc ret = new FileSinkDesc(dirName, tableInfo, compressed, + destTableId, multiFileSpray, numFiles, totalFiles, + partitionCols, dpCtx); + ret.setCompressCodec(compressCodec); + ret.setCompressType(compressType); + ret.setGatherStats(gatherStats); + ret.setStaticSpec(staticSpec); + ret.setStatsAggPrefix(statsKeyPref); + ret.setLinkedFileSink(linkedFileSink); + ret.setParentDir(parentDir); + ret.setLinkedFileSinkDesc(linkedFileSinkDesc); + return (Object) ret; + } + @Explain(displayName = "directory", normalExplain = false) public String getDirName() { return dirName; @@ -88,6 +118,10 @@ this.dirName = dirName; } + public String getFinalDirName() { + return linkedFileSink ?
parentDir : dirName; + } + @Explain(displayName = "table") public TableDesc getTableInfo() { return tableInfo; @@ -248,6 +282,22 @@ } } + public boolean isLinkedFileSink() { + return linkedFileSink; + } + + public void setLinkedFileSink(boolean linkedFileSink) { + this.linkedFileSink = linkedFileSink; + } + + public String getParentDir() { + return parentDir; + } + + public void setParentDir(String parentDir) { + this.parentDir = parentDir; + } + public boolean isStatsReliable() { return statsReliable; } @@ -255,4 +305,12 @@ public void setStatsReliable(boolean statsReliable) { this.statsReliable = statsReliable; } + + public List<FileSinkDesc> getLinkedFileSinkDesc() { + return linkedFileSinkDesc; + } + + public void setLinkedFileSinkDesc(List<FileSinkDesc> linkedFileSinkDesc) { + this.linkedFileSinkDesc = linkedFileSinkDesc; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java (revision 1379405) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java (working copy) @@ -258,6 +258,53 @@ work.setMinSplitSizePerRack(targetSize); } + private static class AverageSize { + private final long totalSize; + private final int numFiles; + + public AverageSize(long totalSize, int numFiles) { + this.totalSize = totalSize; + this.numFiles = numFiles; + } + + public long getTotalSize() { + return totalSize; + } + + public int getNumFiles() { + return numFiles; + } + } + + private AverageSize getAverageSize(FileSystem inpFs, Path dirPath) { + AverageSize error = new AverageSize(-1, -1); + try { + FileStatus[] fStats = inpFs.listStatus(dirPath); + + long totalSz = 0; + int numFiles = 0; + for (FileStatus fStat : fStats) { + if (fStat.isDir()) { + AverageSize avgSzDir = getAverageSize(inpFs, fStat.getPath()); + if (avgSzDir.getTotalSize() < 0) { + return error; + } + totalSz += avgSzDir.getTotalSize(); + numFiles += avgSzDir.getNumFiles(); + } + else { + totalSz += fStat.getLen(); + numFiles++; + } + } + + return new AverageSize(totalSz, numFiles); + } catch (IOException e) { + return error; + } + } + /** * Whether to merge files inside directory given the threshold of the average file size. * @@ -270,23 +317,18 @@ * This could be true when the table is bucketized and all buckets are empty. */ private long getMergeSize(FileSystem inpFs, Path dirPath, long avgSize) { - try { - FileStatus[] fStats = inpFs.listStatus(dirPath); - if (fStats.length <= 1) { - return -1; - } - long totalSz = 0; - for (FileStatus fStat : fStats) { - totalSz += fStat.getLen(); - } + AverageSize averageSize = getAverageSize(inpFs, dirPath); + if (averageSize.getTotalSize() <= 0) { + return -1; + } - if (totalSz < avgSize * fStats.length) { - return totalSz; - } else { - return -1; - } - } catch (IOException e) { + if (averageSize.getNumFiles() <= 1) { return -1; } + + if (averageSize.getTotalSize() / averageSize.getNumFiles() < avgSize) { + return averageSize.getTotalSize(); + } + return -1; } }
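To make the directory handling concrete, here is a minimal standalone sketch (not part of the patch; the class name and paths are invented for illustration) of how linked file sinks derive their directories: each parent of the removed union writes to <finalDir>/<operatorId>, and when GenMRFileSink1 redirects output to a temporary directory it keeps the child's last path component via Utilities.getFileNameFromDirName.

// Standalone illustration; assumes '/' as Path.SEPARATOR.
public class LinkedSinkDirSketch {

  // Mirrors Utilities.getFileNameFromDirName in this patch: return the last
  // path component, or the input unchanged when there is no separator.
  static String getFileNameFromDirName(String dirName) {
    int dirEnd = dirName.lastIndexOf('/');
    return (dirEnd == -1) ? dirName : dirName.substring(dirEnd + 1);
  }

  public static void main(String[] args) {
    String parentDir = "/tmp/hive-scratch/-ext-10000"; // final destination (example)

    // Each union parent gets a clone of the file sink writing to a child
    // directory named after parent.getIdentifier()
    // (see UnionProcFactory.UnionNoProcessFile).
    for (String id : new String[] {"1", "2"}) {
      String childDir = parentDir + "/" + id;
      System.out.println("linked sink writes to: " + childDir);

      // When output is redirected to a tmp dir (GenMRFileSink1), the child
      // keeps its last component and is re-parented under the new tmp dir:
      String tmpDir = "/tmp/hive-scratch/-ext-10002"; // example tmp dir
      System.out.println("after redirect:        "
          + tmpDir + "/" + getFileNameFromDirName(childDir));

      // With dynamic partitions, FileSinkOperator writes Parent/DP/Child
      // instead, e.g. parentDir + "/ds=1/" + id (see initializeSpecPath above).
    }
  }
}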
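Similarly, the new merge criterion in ConditionalResolverMergeFiles can be read as the recursive walk below. This is a sketch, not the patch itself: java.io.File stands in for Hadoop's FileSystem/FileStatus, and a long[] pair stands in for the AverageSize holder.

import java.io.File;

public class MergeSizeSketch {

  // Returns {totalSize, numFiles}; {-1, -1} signals an error, like the
  // AverageSize error instance in the patch. Directories are recursed into,
  // which is what makes the union sub-directories count correctly.
  static long[] averageSize(File dir) {
    File[] entries = dir.listFiles();
    if (entries == null) {
      return new long[] {-1, -1};
    }
    long totalSz = 0;
    long numFiles = 0;
    for (File e : entries) {
      if (e.isDirectory()) {
        long[] sub = averageSize(e);
        if (sub[0] < 0) {
          return new long[] {-1, -1};
        }
        totalSz += sub[0];
        numFiles += sub[1];
      } else {
        totalSz += e.length();
        numFiles++;
      }
    }
    return new long[] {totalSz, numFiles};
  }

  // Merge only when there is more than one file and the average file size is
  // below the threshold; -1 means "do not merge", as in getMergeSize.
  static long mergeSize(File dir, long avgSize) {
    long[] s = averageSize(dir);
    if (s[0] <= 0 || s[1] <= 1) {
      return -1;
    }
    return (s[0] / s[1] < avgSize) ? s[0] : -1;
  }

  public static void main(String[] args) {
    System.out.println(mergeSize(new File("."), 16L * 1024 * 1024));
  }
}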