Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1372257)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -487,6 +487,10 @@
     HIVEOPTBUCKETMAPJOIN("hive.optimize.bucketmapjoin", false), // optimize bucket map join
     HIVEOPTSORTMERGEBUCKETMAPJOIN("hive.optimize.bucketmapjoin.sortedmerge", false), // try to use sorted merge bucket map join
     HIVEOPTREDUCEDEDUPLICATION("hive.optimize.reducededuplication", true),
+    // whether to optimize a union followed by a select star followed by a filesink
+    // It creates sub-directories in the final output, so it should not be turned on in systems
+    // where MAPREDUCE-1501 is not present
+    HIVEOPTIMIZEUNIONSELECTSTAR("hive.optimize.union.selectstar", false),

     // Indexes
     HIVEOPTINDEXFILTER_COMPACT_MINSIZE("hive.optimize.index.filter.compact.minsize", (long) 5 * 1024 * 1024 * 1024), // 5G
Index: ql/src/test/results/clientpositive/union_remove_3.q.out
===================================================================
--- ql/src/test/results/clientpositive/union_remove_3.q.out (revision 0)
+++ ql/src/test/results/clientpositive/union_remove_3.q.out (working copy)
@@ -0,0 +1,284 @@
+PREHOOK: query: -- This is to test the union->selectstar->filesink optimization
+-- Union of 3 subqueries is performed (all of which are map-only queries)
+-- followed by select star and a file sink.
+-- There is no need for any optimization, since the whole query can be processed in
+-- a single map-only job
+-- It does not matter, whether the output is merged or not. In this case, merging is turned
+-- off
+
+create table inputTbl1(key string, val string) stored as textfile
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization
+-- Union of 3 subqueries is performed (all of which are map-only queries)
+-- followed by select star and a file sink.
+-- There is no need for any optimization, since the whole query can be processed in
+-- a single map-only job
+-- It does not matter, whether the output is merged or not.
In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values))))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 3 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery1:a-subquery1-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + null-subquery1-subquery2:a-subquery1-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat 
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 3 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 1 + numPartitions 0 + numRows 18 + rawDataSize 54 + totalSize 72 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: -- Ideally, there should be a query to execute a map-reduce job on outputTbl1. 
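
A minimal sketch of what the .q driver behind these tests would set; the driver files themselves are not part of this diff, so the exact settings are assumptions pieced together from the comments here: hive.optimize.union.selectstar comes from the HiveConf hunk above, mapred.input.dir.recursive from the MAPREDUCE-1501 remarks, and hive.merge.mapfiles is one plausible way the "merging is turned off" toggling is done.

set hive.optimize.union.selectstar=true;
-- reading the sub-directories the optimization writes requires MAPREDUCE-1501:
set mapred.input.dir.recursive=true;
-- this particular test runs with merging turned off (property assumed):
set hive.merge.mapfiles=false;

-- the deferred map-reduce follow-up the surrounding comment describes:
select count(1) from outputTbl1;
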
+-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: -- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +1 3 +1 1 +1 2 +2 3 +2 1 +2 2 +3 3 +3 1 +3 2 +7 3 +7 1 +7 2 +8 3 +8 1 +8 2 +8 3 +8 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_17.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_17.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_17.q.out (working copy) @@ -0,0 +1,283 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need for this optimization, since the query is a map-only query. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need for this optimization, since the query is a map-only query. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values) (TOK_SELEXPR '1' ds)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values) (TOK_SELEXPR '2' ds))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1) (TOK_PARTSPEC (TOK_PARTVAL ds)))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + expr: '1' + type: string + outputColumnNames: _col0, _col1, _col2 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + expr: '2' + type: string + outputColumnNames: _col0, _col1, _col2 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + 
table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Stats-Aggr Operator + + +PREHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=1 +POSTHOOK: Output: default@outputtbl1@ds=2 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [] +# col_name data_type comment + +key string None +values bigint None + +# Partition Information +# col_name data_type comment + +ds string None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 2 + numPartitions 2 + numRows 12 + rawDataSize 24 + totalSize 182 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions outputTbl1 +PREHOOK: type: SHOWPARTITIONS +POSTHOOK: query: show partitions outputTbl1 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), 
(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [] +ds=1 +ds=2 +PREHOOK: query: -- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: -- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [] +1 1 1 +2 1 1 +3 1 1 +7 1 1 +8 1 1 +8 1 1 +PREHOOK: query: select * from outputTbl1 where ds = '2' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '2' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [] +1 2 2 +2 2 2 +3 2 2 +7 2 2 +8 2 2 +8 2 2 Index: ql/src/test/results/clientpositive/union_remove_12.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_12.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_12.q.out (working copy) @@ -0,0 +1,324 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one is a map-join query), followed by select star and a file sink. 
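
To make the sub-directory caveat concrete: when the union is removed, each union branch gets its own FileSinkOperator writing under the final table location, so the table directory is no longer flat. A hedged way to observe this from a .q test follows; the warehouse path is illustrative only (real test output masks locations as "#### A masked pattern was here ####"):

set hive.optimize.union.selectstar=true;
-- after one of the insert-overwrite-union queries in these tests:
dfs -lsr /user/hive/warehouse/outputtbl1;
-- the listing should show one sub-directory per union branch, which is
-- why reading the table back needs mapred.input.dir.recursive (MAPREDUCE-1501)
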
+-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one is a map-join query), followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME inputTbl1) a) (TOK_TABREF (TOK_TABNAME inputTbl1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) val) values))))) c)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-12 is a root stage + Stage-1 depends on stages: Stage-12 + Stage-2 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-2 + Stage-9 depends on stages: Stage-3 , consists of Stage-6, Stage-5, Stage-7 + Stage-6 + Stage-0 depends on stages: Stage-6, Stage-5, Stage-8 + Stage-4 depends on stages: Stage-0 + Stage-5 + Stage-7 + Stage-8 depends on stages: Stage-7 + +STAGE PLANS: + Stage: Stage-12 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:c-subquery2:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:c-subquery2:a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:c-subquery2:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col5 + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery1:c-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-9 + Conditional 
Operator + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-4 + Stats-Aggr Operator + + Stage: Stage-5 + Block level merge + + Stage: Stage-7 + Block level merge + + Stage: Stage-8 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 2 + numPartitions 0 + numRows 14 + rawDataSize 36 + totalSize 194 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), ] +1 11 +2 12 +3 13 +7 17 +8 18 +8 18 +8 28 +8 28 +1 1 +2 1 +3 1 +7 1 +8 1 +8 1 Index: ql/src/test/results/clientpositive/union_remove_5.q.out =================================================================== --- 
ql/src/test/results/clientpositive/union_remove_5.q.out (revision 0)
+++ ql/src/test/results/clientpositive/union_remove_5.q.out (working copy)
@@ -0,0 +1,324 @@
+PREHOOK: query: -- This is to test the union->selectstar->filesink optimization
+-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job)
+-- followed by select star and a file sink.
+-- There is no need to write the temporary results of the sub-queries, and then read them
+-- again to process the union. The union can be removed completely.
+-- It does not matter, whether the output is merged or not. In this case, merging is turned
+-- on
+
+-- There is an issue with merge for hadoop 20.
+-- Till tests start working completely for hadoop 23, we make sure that the merge is not
+-- triggered by setting the average size of small files artificially low
+
+create table inputTbl1(key string, val string) stored as textfile
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization
+-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job)
+-- followed by select star and a file sink.
+-- There is no need to write the temporary results of the sub-queries, and then read them
+-- again to process the union. The union can be removed completely.
+-- It does not matter, whether the output is merged or not. In this case, merging is turned
+-- on
+
+-- There is an issue with merge for hadoop 20.
+-- Till tests start working completely for hadoop 23, we make sure that the merge is not
+-- triggered by setting the average size of small files artificially low
+
+create table inputTbl1(key string, val string) stored as textfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@inputTbl1
+PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@outputTbl1
+PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@inputtbl1
+POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@inputtbl1
+PREHOOK: query: explain
+insert overwrite table outputTbl1
+SELECT *
+FROM (
+  SELECT key, count(1) as values from inputTbl1 group by key
+  UNION ALL
+  SELECT key, 1 as values from inputTbl1
+  UNION ALL
+  SELECT key, 2 as values from inputTbl1
+) a
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+insert overwrite table outputTbl1
+SELECT *
+FROM (
+  SELECT key, count(1) as values from inputTbl1 group by key
+  UNION ALL
+  SELECT key, 1 as values from inputTbl1
+  UNION ALL
+  SELECT key, 2 as values from inputTbl1
+) a
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values))))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2
values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-7 depends on stages: Stage-1, Stage-8, Stage-9 , consists of Stage-4, Stage-3, Stage-5 + Stage-4 + Stage-0 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-2 depends on stages: Stage-0 + Stage-3 + Stage-5 + Stage-6 depends on stages: Stage-5 + Stage-8 is a root stage + Stage-9 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-7 + Conditional Operator + + Stage: Stage-4 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Stats-Aggr Operator + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery1:a-subquery1-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-9 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery2:a-subquery1-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 3 + numPartitions 0 + numRows 6 + rawDataSize 18 + totalSize 408 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +1 2 +2 2 +3 2 +7 2 +8 2 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 
1 +8 1 +8 1 Index: ql/src/test/results/clientpositive/union_remove_14.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_14.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_14.q.out (working copy) @@ -0,0 +1,422 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a join, which should be performed as a map-join query at runtime), +-- followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a join, which should be performed as a map-join query at runtime), +-- followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME inputTbl1) a) (TOK_TABREF (TOK_TABNAME inputTbl1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) val) values))))) c)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-13 is a root stage , consists of Stage-14, Stage-15, Stage-1 + Stage-14 has a backup stage: Stage-1 + Stage-11 depends on stages: Stage-14 + Stage-2 depends on stages: Stage-1, Stage-11, Stage-12 + Stage-8 depends on stages: Stage-2 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-0 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-3 depends on stages: Stage-0 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + Stage-15 has a backup stage: Stage-1 + Stage-12 depends on stages: Stage-15 + Stage-1 + +STAGE PLANS: + Stage: Stage-13 + Conditional Operator + + Stage: Stage-14 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:c-subquery2:b + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:c-subquery2:b + TableScan + alias: b + HashTable Sink Operator + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 0 + + Stage: Stage-11 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:c-subquery2:a + TableScan + alias: a + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col5 + Position of Big Table: 0 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery1:c-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: 
org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-4 + Block level merge + + Stage: Stage-6 + Block level merge + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-15 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:c-subquery2:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:c-subquery2:a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-12 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:c-subquery2:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col5 + Position of Big Table: 1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:c-subquery2:a + TableScan + alias: a + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + null-subquery2:c-subquery2:b + TableScan + alias: b + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, 
comment:null), ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 2 + numPartitions 0 + numRows 14 + rawDataSize 36 + totalSize 194 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), ] +1 11 +2 12 +3 13 +7 17 +8 18 +8 28 +8 18 +8 28 +1 1 +2 1 +3 1 +7 1 +8 1 +8 1 Index: ql/src/test/results/clientpositive/union_remove_7.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_7.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_7.q.out (working copy) @@ -0,0 +1,268 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1, Stage-3 + Stage-2 depends on stages: Stage-0 + Stage-3 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: 
Stage-2 + Stats-Aggr Operator + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 2 + numPartitions 0 + numRows 5 + rawDataSize 10 + totalSize 272 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked 
pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_2.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_2.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_2.q.out (working copy) @@ -0,0 +1,284 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values))))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION 
(TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1, Stage-3, Stage-4 + Stage-2 depends on stages: Stage-0 + Stage-3 is a root stage + Stage-4 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Stats-Aggr Operator + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery1:a-subquery1-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery2:a-subquery1-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + 
SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 3 + numPartitions 0 + numRows 6 + rawDataSize 18 + totalSize 408 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: -- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: -- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. 
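A rough sketch of the follow-up query described in the comment above, for a Hadoop version that does include MAPREDUCE-1501. The snippet is illustrative only and is not part of the recorded test output; hive.optimize.union.selectstar is the flag this patch adds to HiveConf, and mapred.input.dir.recursive is the Hadoop property the comment itself names:

    set hive.optimize.union.selectstar=true;
    -- needed so a map-reduce job can descend into the sub-directories
    -- that the optimization creates under the final output directory
    set mapred.input.dir.recursive=true;
    -- the kind of follow-up query the comment suggests:
    select count(1) from outputTbl1;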
+-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +1 2 +2 2 +3 2 +7 2 +8 2 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 1 +8 1 Index: ql/src/test/results/clientpositive/union_remove_16.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_16.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_16.q.out (working copy) @@ -0,0 +1,391 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on +-- Currently, this optimization does not work in the presence of dynamic partitions. + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on +-- Currently, this optimization does not work in the presence of dynamic partitions.
+ +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values) (TOK_SELEXPR '1' ds)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values) (TOK_SELEXPR '2' ds)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1) (TOK_PARTSPEC (TOK_PARTVAL ds)))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-9 + Stage-8 depends on stages: Stage-2 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-0 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-3 depends on stages: Stage-0 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + Stage-9 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: '2' + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-4 + Block level merge + + Stage: Stage-6 + Block level merge + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-9 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: '1' + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + +PREHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a 
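Since the partition column ds is computed entirely in the select list, the insert above relies on dynamic partitioning end to end. The explain plan bears out the header comment of this file: with dynamic partitions the union is not removed, so Stage-1 and Stage-9 still write temporary results that Stage-2 re-reads. Reproducing the insert outside the test harness would presumably also need the standard dynamic-partition settings, along these lines (the test's own setup is not shown in this diff):

    set hive.exec.dynamic.partition=true;
    -- nonstrict, because the insert specifies no static partition value
    set hive.exec.dynamic.partition.mode=nonstrict;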
+POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=1 +POSTHOOK: Output: default@outputtbl1@ds=2 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Partition Information +# col_name data_type comment + +ds string None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 2 + numPartitions 2 + numRows 10 + rawDataSize 20 + totalSize 178 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions outputTbl1 +PREHOOK: type: SHOWPARTITIONS +POSTHOOK: query: show partitions outputTbl1 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +ds=1 +ds=2 +PREHOOK: query: -- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. 
+-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: -- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 1 +2 1 1 +3 1 1 +7 1 1 +8 2 1 +PREHOOK: query: select * from outputTbl1 where ds = '2' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '2' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 2 +2 1 2 +3 1 2 +7 1 2 +8 2 2 Index: ql/src/test/results/clientpositive/union_remove_11.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_11.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_11.q.out (working copy) @@ -0,0 +1,332 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union which also contains map-only sub-queries), +-- followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20.
+-- Until tests start working completely for hadoop 23, make sure that the merge is not +-- triggered by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union which also contains map-only sub-queries), +-- followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20. +-- Until tests start working completely for hadoop 23, make sure that the merge is not +-- triggered by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 3 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))) b)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5 + Stage-4 + Stage-0 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-2 depends on stages: Stage-0 + Stage-3 + Stage-5 + Stage-6 depends on stages: Stage-5 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: +
null-subquery1:b-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery2:b-subquery2-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery2:b-subquery2-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 3 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-7 + Conditional Operator + + Stage: Stage-4 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Stats-Aggr Operator + + Stage: Stage-3 + Block level merge + + Stage: Stage-5 + Block level merge + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite 
table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 1 + numPartitions 0 + numRows 18 + rawDataSize 36 + totalSize 115 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +1 2 +1 1 +1 3 +2 2 +2 1 +2 3 +3 2 +3 1 +3 3 +7 2 +7 1 +7 3 +8 2 +8 1 +8 3 +8 2 +8 1 +8 3 Index: ql/src/test/results/clientpositive/union_remove_9.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_9.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_9.q.out (working copy) @@ -0,0 +1,323 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which contains a union and is map-only), +-- and the other one is a map-reduce query followed by select star and a file sink. +-- There is no need for the outer union. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20.
+-- Until tests start working completely for hadoop 23, make sure that the merge is not +-- triggered by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which contains a union and is map-only), +-- and the other one is a map-reduce query followed by select star and a file sink. +-- There is no need for the outer union. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20. +-- Until tests start working completely for hadoop 23, make sure that the merge is not +-- triggered by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))) b)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-7 depends on stages: Stage-1, Stage-8 , consists of Stage-4, Stage-3, Stage-5 + Stage-4 + Stage-0 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-2 depends on stages: Stage-0 + Stage-3 + Stage-5 + Stage-6 depends on stages: Stage-5 + Stage-8 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator
Tree: + null-subquery2:b-subquery2-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery2:b-subquery2-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-7 + Conditional Operator + + Stage: Stage-4 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Stats-Aggr Operator + + Stage: Stage-3 + Block level merge + + Stage: Stage-5 + Block level merge + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:b-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 
count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 2 + numPartitions 0 + numRows 12 + rawDataSize 24 + totalSize 272 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +1 1 +1 2 +2 1 +2 2 +3 1 +3 2 +7 1 +7 2 +8 1 +8 2 +8 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_4.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_4.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_4.q.out (working copy) @@ -0,0 +1,320 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20.
+-- Until tests start working completely for hadoop 23, make sure that the merge is not +-- triggered by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20. +-- Until tests start working completely for hadoop 23, make sure that the merge is not +-- triggered by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-7 depends on stages: Stage-1, Stage-8 , consists of Stage-4, Stage-3, Stage-5 + Stage-4 + Stage-0 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-2 depends on stages: Stage-0 + Stage-3 + Stage-5 + Stage-6 depends on stages: Stage-5 + Stage-8 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition
columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-7 + Conditional Operator + + Stage: Stage-4 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Stats-Aggr Operator + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-8 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 
+PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 2 + numPartitions 0 + numRows 5 + rawDataSize 15 + totalSize 272 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_13.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_13.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_13.q.out (working copy) @@ -0,0 +1,370 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a mapred query, and the +-- other one is a map-join query), followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a mapred query, and the +-- other one is a map-join query), followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME inputTbl1) a) (TOK_TABREF (TOK_TABNAME inputTbl1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_HINTLIST (TOK_HINT TOK_MAPJOIN (TOK_HINTARGLIST a))) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) val) values))))) c)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-11 is a root stage + Stage-3 depends on stages: Stage-2, Stage-11 + Stage-9 depends on stages: Stage-3 , consists of Stage-6, Stage-5, Stage-7 + Stage-6 + Stage-0 depends on stages: Stage-6, Stage-5, Stage-8 + Stage-4 depends on stages: Stage-0 + Stage-5 + Stage-7 + Stage-8 depends on stages: Stage-7 + Stage-12 is a root stage + Stage-1 depends on stages: Stage-12 + Stage-2 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-11 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:c-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: UDFToLong(_col1) + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-9 + Conditional Operator + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-4 + Stats-Aggr Operator + + Stage: Stage-5 + Block level merge + + Stage: Stage-7 + Block level merge + + 
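The Conditional Operator in Stage-9 above is resolved at run time: Hive compares the average size of the files just written for default.outputtbl1 against a configured threshold, and then launches either the plain file move or one of the "Block level merge" stages to compact the small RCFiles first. The union_remove_13.q test file itself is not part of this diff, so the following is only a sketch of the merge settings that drive this choice, with values assumed from the usual defaults rather than taken from the test:

set hive.merge.mapfiles=true;            -- consider merging the output of map-only jobs
set hive.merge.mapredfiles=true;         -- consider merging the output of map-reduce jobs
set hive.merge.size.per.task=256000000;  -- target size for each merged file
-- a merge stage is chosen only when the average output file size falls below
-- this threshold; tests can set it artificially low or high to steer the
-- Conditional Operator toward one branch
set hive.merge.smallfiles.avgsize=16000000;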
Stage: Stage-8 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-12 + Map Reduce Local Work + Alias -> Map Local Tables: + null-subquery2:c-subquery2:a + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + null-subquery2:c-subquery2:a + TableScan + alias: a + HashTable Sink Operator + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + Position of Big Table: 1 + + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:c-subquery2:b + TableScan + alias: b + Map Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {key} + 1 {val} + handleSkewJoin: false + keys: + 0 [Column[key]] + 1 [Column[key]] + outputColumnNames: _col0, _col5 + Position of Big Table: 1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + Local Work: + Map Reduce Local Work + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 2 + numPartitions 0 + numRows 13 + rawDataSize 39 + totalSize 197 +#### A masked pattern was here #### + +# Storage Information +SerDe 
Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)a.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)b.FieldSchema(name:val, type:string, comment:null), (inputtbl1)inputtbl1.null, ] +1 11 +2 12 +3 13 +7 17 +8 18 +8 18 +8 28 +8 28 +1 NULL +2 NULL +3 NULL +7 NULL +8 NULL Index: ql/src/test/results/clientpositive/union_remove_6.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_6.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_6.q.out (working copy) @@ -0,0 +1,345 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (both of which are mapred queries) +-- followed by select star and a file sink in 2 output tables. +-- The optimization does not take effect since it is a multi-table insert. +-- It does not matter whether the output is merged or not. In this case, +-- merging is turned off + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (both of which are mapred queries) +-- followed by select star and a file sink in 2 output tables. +-- The optimization does not take effect since it is a multi-table insert. +-- It does not matter whether the output is merged or not.
In this case, +-- merging is turned off + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: create table outputTbl2(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl2(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl2 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +PREHOOK: type: QUERY +POSTHOOK: query: explain +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl2))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-3 depends on stages: Stage-2, Stage-6 + Stage-0 depends on stages: Stage-3 + Stage-4 depends on stages: Stage-0 + Stage-1 depends on stages: Stage-3 + Stage-5 depends on stages: Stage-1 + Stage-6 is a root stage + +STAGE PLANS: + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File 
Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 2 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 2 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-4 + Stats-Aggr Operator + + Stage: Stage-1 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + + Stage: Stage-5 + Stats-Aggr Operator + + Stage: Stage-6 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string 
+ expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + +PREHOOK: query: FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +PREHOOK: Output: default@outputtbl2 +POSTHOOK: query: FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Output: default@outputtbl2 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl2.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl2.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl2.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl2.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 +PREHOOK: query: select * from outputTbl2 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl2 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl2.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl2.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_1.q.out 
=================================================================== --- ql/src/test/results/clientpositive/union_remove_1.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_1.q.out (working copy) @@ -0,0 +1,284 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1, Stage-3 + Stage-2 depends on stages: Stage-0 + Stage-3 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + 
outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Stats-Aggr Operator + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, 
comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 2 + numPartitions 0 + numRows 5 + rawDataSize 15 + totalSize 272 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: -- set mapred.input.dir.recursive=true +-- select count(1) from outputTbl1 +-- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: -- set mapred.input.dir.recursive=true +-- select count(1) from outputTbl1 +-- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 Index: ql/src/test/results/clientpositive/union_remove_15.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_15.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_15.q.out (working copy) @@ -0,0 +1,365 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter whether the output is merged or not. In this case, merging is turned +-- off +-- Currently, this optimization does not work in the presence of dynamic partitions.
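Because the output table created below is partitioned by ds and the insert specifies partition (ds) with no static value, the statement relies on dynamic partitioning, which Hive's strict mode rejects by default. The union_remove_15.q test file is not part of this diff, so the following is only a sketch of the settings such a test would need before its insert, assuming default strict-mode behavior:

set hive.exec.dynamic.partition=true;           -- enable dynamic partition inserts
set hive.exec.dynamic.partition.mode=nonstrict; -- allow every partition column to be dynamic

Without these, an insert overwrite table outputTbl1 partition (ds) statement fails at compile time because no static partition column is supplied.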
+ +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter whether the output is merged or not. In this case, merging is turned +-- off +-- Currently, this optimization does not work in the presence of dynamic partitions. + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values) (TOK_SELEXPR '1' ds)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values) (TOK_SELEXPR '2' ds)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1) (TOK_PARTSPEC (TOK_PARTVAL ds)))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By 
Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: '2' + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: '1' + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + +PREHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT 
key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=1 +POSTHOOK: Output: default@outputtbl1@ds=2 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Partition Information +# col_name data_type comment + +ds string None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 2 + numPartitions 2 + numRows 10 + rawDataSize 20 + totalSize 178 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions outputTbl1 +PREHOOK: type: SHOWPARTITIONS +POSTHOOK: query: show partitions outputTbl1 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +ds=1 +ds=2 +PREHOOK: query: -- Ideally, there should be a query to execute a map-reduce job on outputTbl1. 
+-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 where ds = '1' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: -- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 where ds = '1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 1 +2 1 1 +3 1 1 +7 1 1 +8 2 1 +PREHOOK: query: select * from outputTbl1 where ds = '2' +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '2' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +1 1 2 +2 1 2 +3 1 2 +7 1 2 +8 2 2 Index: ql/src/test/results/clientpositive/union_remove_10.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_10.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_10.q.out (working copy) @@ -0,0 +1,340 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union where one of the sub-queries requires a map-reduce +-- job), followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The outer union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20. +-- Until tests start working completely for hadoop 23, we make sure that the merge is not +-- triggered by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union where one of the sub-queries requires a map-reduce +-- job), followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The outer union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20. +-- Until tests start working completely for hadoop 23, we make sure that the merge is not +-- triggered by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values)))) (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))) b)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-8 depends on stages: Stage-2, Stage-9
, consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-0 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-3 depends on stages: Stage-0 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + Stage-9 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:b-subquery2-subquery1:a-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + null-subquery2:b-subquery2-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-4 + Block level merge + + Stage: Stage-6 + Block level merge + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-9 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:b-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: 
org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 2 + numPartitions 0 + numRows 11 + rawDataSize 22 + totalSize 340 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +1 1 +2 1 +3 1 +7 1 +8 1 +8 1 +1 1 +2 1 +3 1 +7 1 +8 2 +1 2 +2 2 +3 2 +7 2 +8 2 +8 2 Index: ql/src/test/results/clientpositive/union_remove_8.q.out =================================================================== --- ql/src/test/results/clientpositive/union_remove_8.q.out (revision 0) +++ ql/src/test/results/clientpositive/union_remove_8.q.out (working copy) @@ -0,0 +1,272 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union 
of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) values)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 1 values))))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME inputTbl1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR 2 values))))) a)) (TOK_INSERT (TOK_DESTINATION (TOK_TAB (TOK_TABNAME outputTbl1))) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1, Stage-3, Stage-4 + Stage-2 depends on stages: Stage-0 + Stage-3 is a root stage + Stage-4 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:a-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: 
key + type: string + expr: 2 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Stats-Aggr Operator + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery1:a-subquery1-subquery1:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1-subquery2:a-subquery1-subquery2:inputtbl1 + TableScan + alias: inputtbl1 + Select Operator + expressions: + expr: key + type: string + expr: 1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 1 + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE 
+POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +# col_name data_type comment + +key string None +values bigint None + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + numFiles 3 + numPartitions 0 + numRows 6 + rawDataSize 12 + totalSize 408 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +1 2 +2 2 +3 2 +7 2 +8 2 +8 2 +1 1 +2 1 +3 1 +7 1 +8 2 +1 1 +2 1 +3 1 +7 1 +8 1 +8 1 Index: ql/src/test/queries/clientpositive/union_remove_11.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_11.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_11.q (working copy) @@ -0,0 +1,53 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union which also contains map-only sub-queries), +-- followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20.
+-- Until tests work completely on hadoop 23, make sure that the merge is not +-- triggered, by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_6.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_6.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_6.q (working copy) @@ -0,0 +1,37 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (both of which are mapred queries) +-- followed by select star and a file sink in 2 output tables. +-- The optimization does not take effect since it is a multi-table insert. +-- It does not matter, whether the output is merged or not. In this case, +-- merging is turned off + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; +create table outputTbl2(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select *; + +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select *; + +select * from outputTbl1; +select * from outputTbl2; Index: ql/src/test/queries/clientpositive/union_remove_13.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_13.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_13.q (working copy) @@ -0,0 +1,41 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a mapred query, and the +-- other one is a map-join query), followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not.
In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_8.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_8.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_8.q (working copy) @@ -0,0 +1,43 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_15.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_15.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_15.q (working copy) @@ -0,0 +1,52 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.exec.dynamic.partition=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- Currently, this optimization does not work in the presence of dynamic partitions.
+ +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a; + +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a; + +desc formatted outputTbl1; + +show partitions outputTbl1; + +-- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 where ds = '1'; +select * from outputTbl1 where ds = '2'; Index: ql/src/test/queries/clientpositive/union_remove_17.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_17.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_17.q (working copy) @@ -0,0 +1,49 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.exec.dynamic.partition=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need for this optimization, since the query is a map-only query. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a; + +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a; + +desc formatted outputTbl1; +show partitions outputTbl1; + +-- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. 
+-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 where ds = '1'; +select * from outputTbl1 where ds = '2'; Index: ql/src/test/queries/clientpositive/union_remove_1.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_1.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_1.q (working copy) @@ -0,0 +1,46 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +desc formatted outputTbl1; + +-- set mapred.input.dir.recursive=true +-- select count(1) from outputTbl1 +-- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_3.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_3.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_3.q (working copy) @@ -0,0 +1,49 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (all of which are map-only queries) +-- followed by select star and a file sink. +-- There is no need for any optimization, since the whole query can be processed in +-- a single map-only job +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a; + +desc formatted outputTbl1; + +-- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_10.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_10.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_10.q (working copy) @@ -0,0 +1,53 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union where one of the sub-queries requires a map-reduce +-- job), followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The outer union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20.
+-- Until tests work completely on hadoop 23, make sure that the merge is not +-- triggered, by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_5.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_5.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_5.q (working copy) @@ -0,0 +1,47 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20. +-- Until tests work completely on hadoop 23, make sure that the merge is not +-- triggered, by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_12.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_12.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_12.q (working copy) @@ -0,0 +1,42 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one is a map-join query), followed by select star and a file sink.
+-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select /*+ mapjoin(a) */ a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_7.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_7.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_7.q (working copy) @@ -0,0 +1,38 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_14.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_14.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_14.q (working copy) @@ -0,0 +1,43 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.auto.convert.join=true; +set hive.merge.smallfiles.avgsize=1; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a join, which should be performed as a map-join query at runtime), +-- followed by select star and a file sink. +-- The union selectstar optimization should be performed, and the union should be removed. +-- The final file format is different from the input and intermediate file format. 
+-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select a.key as key, b.val as values +FROM inputTbl1 a join inputTbl1 b on a.key=b.key +)c; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_9.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_9.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_9.q (working copy) @@ -0,0 +1,51 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which contains a union and is map-only), +-- and the other one is a map-reduce query followed by select star and a file sink. +-- There is no need for the outer union. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20. +-- Until tests work completely on hadoop 23, make sure that the merge is not +-- triggered, by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b; + +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_16.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_16.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_16.q (working copy) @@ -0,0 +1,52 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.exec.dynamic.partition=true; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union.
The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on +-- Currently, this optimization does not work in the presence of dynamic partitions. + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a; + +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a; + +desc formatted outputTbl1; +show partitions outputTbl1; + +-- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20. +-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1 where ds = '1'; +select * from outputTbl1 where ds = '2'; Index: ql/src/test/queries/clientpositive/union_remove_2.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_2.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_2.q (working copy) @@ -0,0 +1,49 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a; + +desc formatted outputTbl1; + +-- Ideally, there should be a query to execute a map-reduce job on outputTbl1. +-- something like, select count(1) from outputTbl1 +-- However, that requires the following +-- MAPREDUCE-1501 with mapred.input.dir.recursive set to true +-- This is not present in hadoop 20.
+-- However, when HIVE-3344 is fixed, this test can only be run on hadoop 23 + +select * from outputTbl1; Index: ql/src/test/queries/clientpositive/union_remove_4.q =================================================================== --- ql/src/test/queries/clientpositive/union_remove_4.q (revision 0) +++ ql/src/test/queries/clientpositive/union_remove_4.q (working copy) @@ -0,0 +1,42 @@ +set hive.optimize.union.selectstar=true; + +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; +set hive.merge.smallfiles.avgsize=1; + +-- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- There is an issue with merge for hadoop 20. +-- Until tests work completely on hadoop 23, make sure that the merge is not +-- triggered, by setting the small files average size artificially low + +create table inputTbl1(key string, val string) stored as textfile; +create table outputTbl1(key string, values bigint) stored as textfile; + +load data local inpath '../data/files/T1.txt' into table inputTbl1; + +explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a; + +desc formatted outputTbl1; + +select * from outputTbl1; Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcFactory.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcFactory.java (revision 1372257) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcFactory.java (working copy) @@ -18,16 +18,23 @@ package org.apache.hadoop.hive.ql.optimizer.unionproc; import java.io.Serializable; +import java.util.ArrayList; import java.util.List; import java.util.Stack; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorFactory; +import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.UnionOperator; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext.UnionParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.FileSinkDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; /** * Operator factory for union processing. @@ -173,6 +180,92 @@ } /** + * Union followed by no processing. + * This is to optimize queries of the type: + * select * from (subq1 union all subq2 ...)x; + * where at least one of the queries involves a map-reduce job. + * There is no need for a union in this scenario - it involves an extra + * write and read for the final output without this optimization.
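+ * For example (query taken from the union_remove tests; a sketch of the
+ * intended effect, not an exact plan):
+ *   insert overwrite table outputTbl1
+ *   select * from (
+ *     select key, count(1) as values from inputTbl1 group by key
+ *     union all
+ *     select key, 1 as values from inputTbl1
+ *   ) a;
+ * After this rule fires, each sub-query writes directly into its own
+ * sub-directory of the final output directory, and the union is removed.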
+ */ + public static class UnionNoProcessFile implements NodeProcessor { + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + FileSinkOperator fileSinkOp = (FileSinkOperator)nd; + + // Has this filesink already been processed? + if (fileSinkOp.getConf().isLinkedFileSink()) { + return null; + } + + int size = stack.size(); + SelectOperator select = (SelectOperator)stack.get(size-2); + + // If it is not a select star, don't do anything + SelectDesc selectDesc = select.getConf(); + if (!selectDesc.isSelectStar()) { + return null; + } + + UnionOperator union = (UnionOperator)stack.get(size-3); + + // No need for this optimization in case of multi-table inserts + if (union.getChildOperators().size() > 1) { + return null; + } + + UnionProcContext ctx = (UnionProcContext) procCtx; + UnionParseContext uCtx = ctx.getUnionParseContext(union); + + // No need for this if all sub-queries are map-only queries + if (uCtx.allMapOnlySubQ()) { + return null; + } + + // Don't do anything for dynamic partitions + if (fileSinkOp.getConf().getDynPartCtx() != null) { + return null; + } + + String parentDirName = fileSinkOp.getConf().getDirName(); + + // Clone the fileSinkDesc of the final fileSink and create similar fileSinks at + // each parent + List fileDescLists = new ArrayList(); + + for (Operator parent : union.getParentOperators()) { + FileSinkDesc fileSinkDesc = null; + try { + fileSinkDesc = (FileSinkDesc) fileSinkOp.getConf().clone(); + } catch (CloneNotSupportedException e) { + // TODO: throw an exception + return null; + } + + String dirName = parentDirName + Path.SEPARATOR + parent.getIdentifier(); + fileSinkDesc.setDirName(dirName); + fileSinkDesc.setLinkedFileSink(true); + fileSinkDesc.setParentDir(parentDirName); + // Detach the union from this parent, and attach the cloned fileSink instead + parent.setChildOperators(null); + Operator tmpFileSinkOp = + OperatorFactory.getAndMakeChild(fileSinkDesc, parent); + tmpFileSinkOp.setChildOperators(null); + fileDescLists.add(fileSinkDesc); + } + + for (FileSinkDesc fileDesc : fileDescLists) { + fileDesc.setLinkedFileSinkDesc(fileDescLists); + } + + // Delink the union as if it was not present in the first place. + union.setParentOperators(null); + return null; + } + } + + /** * Default processor. */ public static class NoUnion implements NodeProcessor { @@ -204,4 +297,7 @@ return new NoUnion(); } + public static NodeProcessor getUnionNoProcessFile() { + return new UnionNoProcessFile(); + } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcessor.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcessor.java (revision 1372257) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/unionproc/UnionProcessor.java (working copy) @@ -22,6 +22,7 @@ import java.util.LinkedHashMap; import java.util.Map; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; import org.apache.hadoop.hive.ql.lib.GraphWalker; @@ -64,13 +65,18 @@ // the operator stack.
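+ // Note (editor's gloss, not in the original patch): each rule pattern below
+ // is matched against the sequence of operator names on the walker's stack;
+ // e.g. the new rule "UNION%SEL%FS%" fires when a FileSinkOperator is reached
+ // directly under a SelectOperator which is directly under a UnionOperator -
+ // the union->selectstar->filesink shape this patch optimizes.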
Map opRules = new LinkedHashMap(); opRules.put(new RuleRegExp("R1", "RS%.*UNION%"), - UnionProcFactory.getMapRedUnion()); + UnionProcFactory.getMapRedUnion()); opRules.put(new RuleRegExp("R2", "UNION%.*UNION%"), - UnionProcFactory.getUnknownUnion()); + UnionProcFactory.getUnknownUnion()); opRules.put(new RuleRegExp("R3", "TS%.*UNION%"), - UnionProcFactory.getMapUnion()); - opRules.put(new RuleRegExp("R3", "MAPJOIN%.*UNION%"), - UnionProcFactory.getMapJoinUnion()); + UnionProcFactory.getMapUnion()); + opRules.put(new RuleRegExp("R4", "MAPJOIN%.*UNION%"), + UnionProcFactory.getMapJoinUnion()); + HiveConf conf = pCtx.getConf(); + if (conf.getBoolVar(HiveConf.ConfVars.HIVEOPTIMIZEUNIONSELECTSTAR)) { + opRules.put(new RuleRegExp("R5", "UNION%SEL%FS%"), + UnionProcFactory.getUnionNoProcessFile()); + } // The dispatcher fires the processor for the matching rule and passes the // context along Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRProcContext.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRProcContext.java (revision 1372257) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRProcContext.java (working copy) @@ -23,6 +23,7 @@ import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Set; import org.apache.hadoop.hive.conf.HiveConf; @@ -38,6 +39,7 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.plan.DependencyCollectionWork; +import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.MoveWork; import org.apache.hadoop.hive.ql.plan.TableDesc; @@ -251,6 +253,10 @@ private List> rootOps; private DependencyCollectionTask dependencyTaskForMultiInsert; + // If many fileSinkDescs are linked to each other, keep track of the task created for + // the first fileSinkDesc; the others can reuse it + private Map> linkedFileDescTasks; + /** * Set of read entities. This list is generated by the walker and is passed to * the hooks.
@@ -312,6 +318,7 @@ unionTaskMap = new HashMap(); mapJoinTaskMap = new HashMap, GenMRMapJoinCtx>(); dependencyTaskForMultiInsert = null; + linkedFileDescTasks = null; } /** @@ -565,4 +572,13 @@ } return dependencyTaskForMultiInsert; } + + public Map> getLinkedFileDescTasks() { + return linkedFileDescTasks; + } + + public void setLinkedFileDescTasks( + Map> linkedFileDescTasks) { + this.linkedFileDescTasks = linkedFileDescTasks; + } } Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java (revision 1372257) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMRFileSink1.java (working copy) @@ -23,6 +23,7 @@ import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Stack; import org.apache.commons.logging.Log; @@ -108,6 +109,16 @@ parseCtx.getQB().getParseInfo().isInsertToTable(); HiveConf hconf = parseCtx.getConf(); + // If this file sink desc has been processed due to a linked file sink desc, + // use that task + Map> fileSinkDescs = ctx.getLinkedFileDescTasks(); + if (fileSinkDescs != null) { + Task childTask = fileSinkDescs.get(fsOp.getConf()); + if (childTask != null) { + processLinkedFileDesc(ctx, childTask); + return null; + } + } // Has the user enabled merging of files for map-only jobs or for all jobs if ((ctx.getMvTask() != null) && (!ctx.getMvTask().isEmpty())) { @@ -128,19 +139,27 @@ } if ((mvTask != null) && !mvTask.isLocal()) { - // There are separate configuration parameters to control whether to - // merge for a map-only job - // or for a map-reduce job - MapredWork currWork = (MapredWork) currTask.getWork(); - boolean mergeMapOnly = - hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPFILES) && - currWork.getReducer() == null; - boolean mergeMapRed = - hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPREDFILES) && - currWork.getReducer() != null; - if (mergeMapOnly || mergeMapRed) { - chDir = true; + if (fsOp.getConf().isLinkedFileSink()) { + if (hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPFILES) || + hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPREDFILES)) { + chDir = true; + } } + else { + // There are separate configuration parameters to control whether to + // merge for a map-only job + // or for a map-reduce job + MapredWork currWork = (MapredWork) currTask.getWork(); + boolean mergeMapOnly = + hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPFILES) && + currWork.getReducer() == null; + boolean mergeMapRed = + hconf.getBoolVar(HiveConf.ConfVars.HIVEMERGEMAPREDFILES) && + currWork.getReducer() != null; + if (mergeMapOnly || mergeMapRed) { + chDir = true; + } + } } } } @@ -152,9 +171,54 @@ createMergeJob((FileSinkOperator) nd, ctx, finalName); } + FileSinkDesc fileSinkDesc = fsOp.getConf(); + if (fileSinkDesc.isLinkedFileSink()) { + Map> linkedFileDescTasks = + ctx.getLinkedFileDescTasks(); + if (linkedFileDescTasks == null) { + linkedFileDescTasks = new HashMap>(); + ctx.setLinkedFileDescTasks(linkedFileDescTasks); + } + assert currTask.getChildTasks() != null; + assert currTask.getChildTasks().size() == 1; + + for (FileSinkDesc fileDesc : fileSinkDesc.getLinkedFileSinkDesc()) { + linkedFileDescTasks.put(fileDesc, currTask.getChildTasks().get(0)); + } + + } + return null; } + /* + * Multiple file sink descriptors are linked. 
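+ * All the linked descriptors write under the same parent directory, so the
+ * child task created while processing the first of them can be shared by
+ * the rest instead of creating a separate task per file sink.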
+ * Use the task created by the first linked file descriptor + */ + private void processLinkedFileDesc(GenMRProcContext ctx, + Task childTask) + throws SemanticException { + Operator currTopOp = ctx.getCurrTopOp(); + String currAliasId = ctx.getCurrAliasId(); + List> seenOps = ctx.getSeenOps(); + List> rootTasks = ctx.getRootTasks(); + Task currTask = ctx.getCurrTask(); + + if (currTopOp != null) { + if (!seenOps.contains(currTopOp)) { + seenOps.add(currTopOp); + GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, + (MapredWork) currTask.getWork(), false, ctx); + } + + if (!rootTasks.contains(currTask)) { + rootTasks.add(currTask); + } + } + + currTask.addDependentTask(childTask); + } + /** * Add the StatsTask as a dependent task of the MoveTask * because StatsTask will change the Table/Partition metadata. For atomicity, we @@ -261,10 +325,10 @@ // since it is unknown if the merge MR will be triggered at execution time. MoveWork dummyMv = new MoveWork(null, null, null, - new LoadFileDesc(fsConf.getDirName(), finalName, true, null, null), false); + new LoadFileDesc(fsConf.getFinalDirName(), finalName, true, null, null), false); ConditionalTask cndTsk = createCondTask(conf, currTask, dummyMv, cplan, - fsConf.getDirName()); + fsConf.getFinalDirName()); linkMoveTask(ctx, newOutput, cndTsk); } @@ -385,7 +449,7 @@ // MapRedTask currTask = (MapRedTask) ctx.getCurrTask(); MoveWork dummyMv = new MoveWork(null, null, null, - new LoadFileDesc(fsInputDesc.getDirName(), finalName, true, null, null), false); + new LoadFileDesc(fsInputDesc.getFinalDirName(), finalName, true, null, null), false); MapredWork cplan; if(parseCtx.getConf().getBoolVar(HiveConf.ConfVars. @@ -415,7 +479,7 @@ // NOTE: we should gather stats in MR1 rather than MR2 at merge job since we don't // know if merge MR2 will be triggered at execution time ConditionalTask cndTsk = createCondTask(ctx.getConf(), ctx.getCurrTask(), dummyMv, cplan, - fsInputDesc.getDirName()); + fsInputDesc.getFinalDirName()); // keep the dynamic partition context in conditional task resolver context ConditionalResolverMergeFilesCtx mrCtx = @@ -514,7 +578,7 @@ FileSinkDesc fsDesc) { ArrayList aliases = new ArrayList(); - String inputDir = fsDesc.getDirName(); + String inputDir = fsDesc.getFinalDirName(); TableDesc tblDesc = fsDesc.getTableInfo(); aliases.add(inputDir); // dummy alias: just use the input path @@ -539,7 +603,7 @@ private MapredWork createRCFileMergeTask(FileSinkDesc fsInputDesc, String finalName, boolean hasDynamicPartitions) throws SemanticException { - String inputDir = fsInputDesc.getDirName(); + String inputDir = fsInputDesc.getFinalDirName(); TableDesc tblDesc = fsInputDesc.getTableInfo(); if(tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) { @@ -635,8 +699,9 @@ srcDir = mvWork.getLoadTableWork().getSourceDir(); } + String fsOpDirName = fsOp.getConf().getFinalDirName(); if ((srcDir != null) - && (srcDir.equalsIgnoreCase(fsOp.getConf().getDirName()))) { + && (srcDir.equalsIgnoreCase(fsOpDirName))) { return mvTsk; } } @@ -679,7 +744,7 @@ String dest = null; if (chDir) { - dest = fsOp.getConf().getDirName(); + dest = fsOp.getConf().getFinalDirName(); // generate the temporary file // it must be on the same file system as the current destination @@ -687,7 +752,17 @@ Context baseCtx = parseCtx.getContext(); String tmpDir = baseCtx.getExternalTmpFileURI((new Path(dest)).toUri()); - fsOp.getConf().setDirName(tmpDir); + FileSinkDesc fileSinkDesc = fsOp.getConf(); + // Change all the linked file sink descriptors + if 
Index: ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java	(revision 1372257)
+++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java	(working copy)
@@ -31,13 +31,13 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;
-import java.util.HashSet;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -63,7 +63,6 @@
 import org.apache.hadoop.hive.metastore.api.HiveObjectRef;
 import org.apache.hadoop.hive.metastore.api.HiveObjectType;
 import org.apache.hadoop.hive.metastore.api.Index;
-import org.apache.hadoop.hive.metastore.api.InvalidObjectException;
 import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
 import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
@@ -1868,10 +1867,7 @@
           fs.delete(itemStaging, true);
           continue;
         }
-        if (item.isDir()) {
-          throw new HiveException("checkPaths: " + src.getPath()
-              + " has nested directory" + itemStaging);
-        }
+
         if (!replace) {
           // It's possible that the file we're copying may have the same
           // relative name as an existing file in the "destf" directory.
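The checkPaths change above stops rejecting nested directories in the staged output, since linked file sinks legitimately write sub-directories under the destination. A toy recursive walk showing the intended descend-instead-of-throw behavior (not Hive code; java.io.File stands in for the Hadoop FileSystem API):

import java.io.File;

public class NestedDirCheckDemo {
  // Old behavior: a directory inside the staged output was an error.
  // New behavior: recurse into it and account for its contents.
  static long totalLength(File f) {
    if (!f.isDirectory()) {
      return f.length();
    }
    long sum = 0;
    File[] children = f.listFiles();
    if (children != null) {
      for (File child : children) {
        sum += totalLength(child); // descend instead of throwing
      }
    }
    return sum;
  }

  public static void main(String[] args) {
    System.out.println(totalLength(new File(args.length > 0 ? args[0] : ".")));
  }
}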
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java	(revision 1372257)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java	(working copy)
@@ -23,6 +23,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -70,6 +71,7 @@
   private PartitionDesc currPart;
   private TableDesc currTbl;
   private boolean tblDataDone;
+  private HashSet<Path> doneTables;

   private transient RecordReader<WritableComparable, Writable> currRecReader;
   private transient InputSplit[] inputSplits;
@@ -199,31 +201,64 @@
     return partValues;
   }

-  private void getNextPath() throws Exception {
-    // first time
-    if (iterPath == null) {
-      if (work.getTblDir() != null) {
+
+  /**
+   * Update currPath with the appropriate path to the table. If the given TblDirPath is
+   * non-empty, that is the currPath. Otherwise, we take the non-empty children of TblDirPath.
+   */
+  private void getTableInfo() throws Exception {
+    currPath = work.getTblDirPath();
+    currTbl = work.getTblDesc();
+    if (isNativeTable) {
+      if (doneTables == null) {
+        doneTables = new HashSet<Path>();
+      }
+      FileSystem fs = currPath.getFileSystem(job);
+      if (fs.exists(currPath)) {
+        FileStatus[] fStats = listStatusUnderPath(fs, currPath);
+        for (FileStatus fStat : fStats) {
+          if (!fStat.isDir() && fStat.getLen() > 0 && !doneTables.contains(currPath)) {
+            tblDataDone = true;
+            doneTables.add(currPath);
+            break;
+          }
+        }
+        if (!tblDataDone) {
-          currPath = work.getTblDirPath();
-          currTbl = work.getTblDesc();
-          if (isNativeTable) {
-            FileSystem fs = currPath.getFileSystem(job);
-            if (fs.exists(currPath)) {
-              FileStatus[] fStats = listStatusUnderPath(fs, currPath);
-              for (FileStatus fStat : fStats) {
-                if (fStat.getLen() > 0) {
-                  tblDataDone = true;
-                  break;
+          for (FileStatus fStat : fStats) {
+            if (fStat.isDir() && !doneTables.contains(fStat.getPath())) {
+              boolean nonEmpty = false;
+              for (FileStatus fStatChild : listStatusUnderPath(fs, fStat.getPath())) {
+                if (!fStatChild.isDir() && fStatChild.getLen() > 0) {
+                  nonEmpty = true;
                 }
               }
+              if (nonEmpty) {
+                currPath = fStat.getPath();
+                doneTables.add(fStat.getPath());
+                tblDataDone = true;
+                break;
+              }
             }
-          } else {
-            tblDataDone = true;
           }
+        }
+      }
+    } else {
+      tblDataDone = true;
+    }

-          if (!tblDataDone) {
-            currPath = null;
-          }
+    if (!tblDataDone) {
+      currPath = null;
+    }
+  }
+
+  private void getNextPath() throws Exception {
+    // first time
+    if (iterPath == null) {
+      if (work.getTblDir() != null) {
+        if (!tblDataDone) {
+          getTableInfo();
+          tblDataDone = false;
           return;
         } else {
           currTbl = null;
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java	(revision 1372257)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java	(working copy)
@@ -1206,6 +1206,14 @@
     return taskId;
   }

+  public static String getFileNameFromDirName(String dirName) {
+    int dirEnd = dirName.lastIndexOf(Path.SEPARATOR);
+    if (dirEnd != -1) {
+      return dirName.substring(dirEnd + 1);
+    }
+    return dirName;
+  }
+
   /**
    * Replace the task id from the filename. It is assumed that the filename is derived from the
    * output of getTaskId
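FetchOperator.getTableInfo now hands back one not-yet-visited non-empty child directory per call, and Utilities.getFileNameFromDirName supplies the leaf name needed when linked sinks are re-pointed at a temporary directory. A standalone check of that helper's contract (example paths invented; Hadoop's Path.SEPARATOR is "/"):

public class FileNameDemo {
  // Everything after the last separator, or the input itself if none.
  static String getFileNameFromDirName(String dirName) {
    int dirEnd = dirName.lastIndexOf('/');
    return (dirEnd != -1) ? dirName.substring(dirEnd + 1) : dirName;
  }

  public static void main(String[] args) {
    System.out.println(getFileNameFromDirName("/tmp/hive/-ext-10000/1")); // "1"
    System.out.println(getFileNameFromDirName("noslash"));                // "noslash"
  }
}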
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java	(revision 1372257)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/FileSinkDesc.java	(working copy)
@@ -20,6 +20,7 @@

 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.List;

 import org.apache.hadoop.fs.Path;

@@ -28,7 +29,7 @@
  *
  */
 @Explain(displayName = "File Output Operator")
-public class FileSinkDesc implements Serializable {
+public class FileSinkDesc implements Serializable, Cloneable {
   private static final long serialVersionUID = 1L;
   private String dirName;
   // normally statsKeyPref will be the same as dirName, but the latter
@@ -46,6 +47,19 @@
   private DynamicPartitionCtx dpCtx;
   private String staticSpec; // static partition spec ends with a '/'
   private boolean gatherStats;
+
+  // This file descriptor is linked to other file descriptors.
+  // One use case is that a union -> select (star) -> file sink is broken down.
+  // For eg: consider a query like:
+  // select * from (subq1 union all subq2)x;
+  // where subq1 or subq2 involves a map-reduce job.
+  // It is broken into two independent queries involving subq1 and subq2 directly, and
+  // the sub-queries write to sub-directories of a common directory. So, the file sink
+  // descriptors for subq1 and subq2 are linked.
+  transient private boolean linkedFileSink = false;
+  transient private String parentDir;
+  transient private List<FileSinkDesc> linkedFileSinkDesc;
+
   private boolean statsReliable;

   public FileSinkDesc() {
@@ -80,6 +94,19 @@
     this.partitionCols = null;
   }

+  @Override
+  public Object clone() throws CloneNotSupportedException {
+    FileSinkDesc ret = new FileSinkDesc(dirName, tableInfo, compressed,
+        destTableId, multiFileSpray, numFiles, totalFiles,
+        partitionCols, dpCtx);
+    ret.setCompressCodec(compressCodec);
+    ret.setCompressType(compressType);
+    ret.setGatherStats(gatherStats);
+    ret.setStaticSpec(staticSpec);
+    ret.setStatsAggPrefix(statsKeyPref);
+    return (Object) ret;
+  }
+
   @Explain(displayName = "directory", normalExplain = false)
   public String getDirName() {
     return dirName;
@@ -89,6 +116,10 @@
     this.dirName = dirName;
   }

+  public String getFinalDirName() {
+    return linkedFileSink ? parentDir : dirName;
+  }
+
   @Explain(displayName = "table")
   public TableDesc getTableInfo() {
     return tableInfo;
@@ -249,6 +280,22 @@
     }
   }

+  public boolean isLinkedFileSink() {
+    return linkedFileSink;
+  }
+
+  public void setLinkedFileSink(boolean linkedFileSink) {
+    this.linkedFileSink = linkedFileSink;
+  }
+
+  public String getParentDir() {
+    return parentDir;
+  }
+
+  public void setParentDir(String parentDir) {
+    this.parentDir = parentDir;
+  }
+
   public boolean isStatsReliable() {
     return statsReliable;
   }
@@ -256,4 +303,12 @@
   public void setStatsReliable(boolean statsReliable) {
     this.statsReliable = statsReliable;
   }
+
+  public List<FileSinkDesc> getLinkedFileSinkDesc() {
+    return linkedFileSinkDesc;
+  }
+
+  public void setLinkedFileSinkDesc(List<FileSinkDesc> linkedFileSinkDesc) {
+    this.linkedFileSinkDesc = linkedFileSinkDesc;
+  }
 }
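To make the linked-descriptor comment above concrete: each branch of the rewritten union writes below one parent directory, and getFinalDirName() resolves every member of the group to that parent, so downstream move/merge logic sees a single location. A sketch with invented paths and a simplified stand-in class (not the real FileSinkDesc):

public class LinkedSinkLayoutDemo {
  static class Sink {
    final String dirName;
    final String parentDir;     // non-null marks a linked sink
    Sink(String dirName, String parentDir) {
      this.dirName = dirName;
      this.parentDir = parentDir;
    }
    String finalDirName() {
      return parentDir != null ? parentDir : dirName;
    }
  }

  public static void main(String[] args) {
    String parent = "/warehouse/outputtbl1/-ext-10000";
    Sink branch1 = new Sink(parent + "/1", parent);
    Sink branch2 = new Sink(parent + "/2", parent);
    // A single move task can publish the parent once for both branches.
    System.out.println(branch1.finalDirName().equals(branch2.finalDirName())); // true
  }
}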
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java	(revision 1372257)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ConditionalResolverMergeFiles.java	(working copy)
@@ -258,6 +258,53 @@
     work.setMinSplitSizePerRack(targetSize);
   }

+  private static class AverageSize {
+    private final long totalSize;
+    private final int numFiles;
+
+    public AverageSize(long totalSize, int numFiles) {
+      this.totalSize = totalSize;
+      this.numFiles = numFiles;
+    }
+
+    public long getTotalSize() {
+      return totalSize;
+    }
+
+    public int getNumFiles() {
+      return numFiles;
+    }
+  }
+
+  private AverageSize getAverageSize(FileSystem inpFs, Path dirPath) {
+    AverageSize dummy = new AverageSize(0, 0);
+    AverageSize error = new AverageSize(-1, -1);
+    try {
+      FileStatus[] fStats = inpFs.listStatus(dirPath);
+
+      long totalSz = 0;
+      int numFiles = 0;
+      for (FileStatus fStat : fStats) {
+        if (fStat.isDir()) {
+          AverageSize avgSzDir = getAverageSize(inpFs, fStat.getPath());
+          if (avgSzDir.getTotalSize() < 0) {
+            return error;
+          }
+          totalSz += avgSzDir.getTotalSize();
+          numFiles += avgSzDir.getNumFiles();
+        }
+        else {
+          totalSz += fStat.getLen();
+          numFiles++;
+        }
+      }
+
+      return new AverageSize(totalSz, numFiles);
+    } catch (IOException e) {
+      return error;
+    }
+  }

   /**
    * Whether to merge files inside directory given the threshold of the average file size.
    *
@@ -270,23 +317,18 @@
    * This could be true when the table is bucketized and all buckets are empty.
    */
   private long getMergeSize(FileSystem inpFs, Path dirPath, long avgSize) {
-    try {
-      FileStatus[] fStats = inpFs.listStatus(dirPath);
-      if (fStats.length <= 1) {
-        return -1;
-      }
-      long totalSz = 0;
-      for (FileStatus fStat : fStats) {
-        totalSz += fStat.getLen();
-      }
+    AverageSize averageSize = getAverageSize(inpFs, dirPath);
+    if (averageSize.getTotalSize() <= 0) {
+      return -1;
+    }

-      if (totalSz < avgSize * fStats.length) {
-        return totalSz;
-      } else {
-        return -1;
-      }
-    } catch (IOException e) {
+    if (averageSize.getNumFiles() <= 1) {
       return -1;
     }
+
+    if (averageSize.getTotalSize()/averageSize.getNumFiles() < avgSize) {
+      return averageSize.getTotalSize();
+    }
+    return -1;
   }
 }
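The rewritten getMergeSize delegates to the recursive getAverageSize, so sub-directories created by linked file sinks are counted rather than breaking the size estimate; merging still requires more than one file and an average file size below the threshold. A self-contained approximation of that rule over a local directory tree (java.io.File stands in for FileSystem/FileStatus):

import java.io.File;

public class MergeDecisionDemo {
  // Returns {totalSize, numFiles}, descending into sub-directories.
  static long[] sizes(File dir) {
    long total = 0, count = 0;
    File[] children = dir.listFiles();
    if (children != null) {
      for (File f : children) {
        if (f.isDirectory()) {          // descend into union sub-directories
          long[] sub = sizes(f);
          total += sub[0];
          count += sub[1];
        } else {
          total += f.length();
          count++;
        }
      }
    }
    return new long[] {total, count};
  }

  // -1 means "do not merge"; otherwise the total size to be merged.
  static long mergeSize(File dir, long avgTarget) {
    long[] s = sizes(dir);
    if (s[0] <= 0 || s[1] <= 1) {
      return -1;                        // empty output or a single file
    }
    return (s[0] / s[1] < avgTarget) ? s[0] : -1;
  }

  public static void main(String[] args) {
    System.out.println(mergeSize(new File("."), 16 * 1024 * 1024));
  }
}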