From 75e337672c2c53d089b4099815793d76df7f04f4 Mon Sep 17 00:00:00 2001 From: Na Yang Date: Tue, 19 Aug 2014 18:32:43 -0700 Subject: [PATCH] HIVE-7767: hive.optimize.union.remove does not work properly [Spark Branch] --- .../test/resources/testconfiguration.properties | 20 + .../hadoop/hive/ql/exec/spark/GraphTran.java | 9 +- .../clientpositive/spark/union_remove_1.q.out | 247 ++++++++ .../clientpositive/spark/union_remove_10.q.out | 327 ++++++++++ .../clientpositive/spark/union_remove_11.q.out | 255 ++++++++ .../clientpositive/spark/union_remove_15.q.out | 278 +++++++++ .../clientpositive/spark/union_remove_16.q.out | 306 ++++++++++ .../clientpositive/spark/union_remove_17.q.out | 236 ++++++++ .../clientpositive/spark/union_remove_18.q.out | 318 ++++++++++ .../clientpositive/spark/union_remove_19.q.out | 603 +++++++++++++++++++ .../clientpositive/spark/union_remove_20.q.out | 249 ++++++++ .../clientpositive/spark/union_remove_21.q.out | 247 ++++++++ .../clientpositive/spark/union_remove_22.q.out | 417 +++++++++++++ .../clientpositive/spark/union_remove_24.q.out | 245 ++++++++ .../clientpositive/spark/union_remove_25.q.out | 666 +++++++++++++++++++++ .../clientpositive/spark/union_remove_3.q.out | 235 ++++++++ .../clientpositive/spark/union_remove_4.q.out | 297 +++++++++ .../clientpositive/spark/union_remove_5.q.out | 308 ++++++++++ .../clientpositive/spark/union_remove_6.q.out | 269 +++++++++ .../clientpositive/spark/union_remove_7.q.out | 251 ++++++++ .../clientpositive/spark/union_remove_8.q.out | 260 ++++++++ 21 files changed, 6040 insertions(+), 3 deletions(-) create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_1.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_10.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_11.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_15.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_16.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_17.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_18.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_19.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_20.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_21.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_22.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_24.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_25.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_3.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_4.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_5.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_6.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_7.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_8.q.out diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index ecb8b74..f598524 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -333,6 +333,26 @@ spark.query.files=spark_test.q \ timestamp_lazy.q \ timestamp_null.q \ timestamp_udf.q \ + union_remove_1.q \ + 
union_remove_10.q \ + union_remove_11.q \ + union_remove_15.q \ + union_remove_16.q \ + union_remove_17.q \ + union_remove_18.q \ + union_remove_19.q \ + union_remove_2.q \ + union_remove_20.q \ + union_remove_21.q \ + union_remove_22.q \ + union_remove_24.q \ + union_remove_25.q \ + union_remove_3.q \ + union_remove_4.q \ + union_remove_5.q \ + union_remove_6.q \ + union_remove_7.q \ + union_remove_8.q \ union_null.q \ union_ppr.q \ union.q \ diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/spark/GraphTran.java ql/src/java/org/apache/hadoop/hive/ql/exec/spark/GraphTran.java index 8a3422e..03f0ff8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/spark/GraphTran.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/spark/GraphTran.java @@ -55,7 +55,7 @@ public void addTranWithInput(SparkTran tran, } public void execute() throws Exception { - JavaPairRDD<BytesWritable, BytesWritable> resultRDD = null; + Map<SparkTran, JavaPairRDD<BytesWritable, BytesWritable>> resultRDDs = new HashMap<SparkTran, JavaPairRDD<BytesWritable, BytesWritable>>(); for (SparkTran tran : rootTrans) { // make sure all the root trans are MapTran if (!(tran instanceof MapTran)) { @@ -94,9 +94,12 @@ public void execute() throws Exception { } tran = childTran; } - resultRDD = rdd; + // if the current tran is a leaf tran that has not been processed yet, cache its corresponding RDD + if (!resultRDDs.containsKey(tran) && getChildren(tran).isEmpty()) { + resultRDDs.put(tran, rdd); + } } - if (resultRDD != null) { + for (JavaPairRDD<BytesWritable, BytesWritable> resultRDD : resultRDDs.values()) { resultRDD.foreach(HiveVoidFunction.getInstance()); } } diff --git ql/src/test/results/clientpositive/spark/union_remove_1.q.out ql/src/test/results/clientpositive/spark/union_remove_1.q.out new file mode 100644 index 0000000..c07fbb1 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_1.q.out @@ -0,0 +1,247 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not.
In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + 
outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 40 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort 
Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 1 +2 1 +2 1 +3 1 +3 1 +7 1 +7 1 +8 2 +8 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_10.q.out ql/src/test/results/clientpositive/spark/union_remove_10.q.out new file mode 100644 index 0000000..0808921 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_10.q.out @@ -0,0 +1,327 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union where one of the sub-queries requires a map-reduce +-- job), followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The outer union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union where one of the sub-queries requires a map-reduce +-- job), followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The outer union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5 + Stage-4 + Stage-2 depends on stages: Stage-4, Stage-3, Stage-6, Stage-9, Stage-8, Stage-11 + Stage-0 depends on stages: Stage-2 + Stage-3 + Stage-5 + Stage-6 depends on stages: Stage-5 + Stage-12 depends on stages: Stage-1 , consists of Stage-9, Stage-8, Stage-10 + Stage-9 + Stage-8 + Stage-10 + Stage-11 depends on stages: Stage-10 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Union 3 <- Map 5 (NONE), Reducer 2 (NONE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 4 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string), UDFToLong(1) (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + 
table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Map 5 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), UDFToLong(2) (type: bigint) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Union 3 + Vertex: Union 3 + + Stage: Stage-7 + Conditional Operator + + Stage: Stage-4 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Merge Work + merge level: block + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + + Stage: Stage-5 + Merge Work + merge level: block + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-12 + Conditional Operator + + Stage: Stage-9 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-8 + Merge Work + merge level: block + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + + Stage: Stage-10 + Merge Work + merge level: block + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + + Stage: Stage-11 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION 
[(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 180 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 2 +2 1 +2 2 +3 1 +3 2 +7 1 +7 2 +8 2 +8 2 +8 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_11.q.out ql/src/test/results/clientpositive/spark/union_remove_11.q.out new file mode 100644 index 0000000..96651e1 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_11.q.out @@ -0,0 +1,255 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union where also contains map only sub-queries), +-- followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union where also contains map only sub-queries), +-- followed by select star and a file sink. +-- There is no need for the union optimization, since the whole query can be performed +-- in a single map-only job +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Union 2 <- Map 1 (NONE), Map 3 (NONE), Map 4 (NONE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), 2 (type: int) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), 1 (type: int) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Map 4 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), 3 (type: int) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint) + outputColumnNames: _col0, _col1 + File Output 
Operator + compressed: false + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Union 2 + Vertex: Union 2 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 3 + numRows -1 + rawDataSize -1 + totalSize 273 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 2 +1 3 +2 1 +2 2 +2 3 +3 1 +3 2 +3 3 +7 1 +7 2 +7 3 +8 1 +8 1 +8 2 +8 2 +8 3 +8 3 diff --git ql/src/test/results/clientpositive/spark/union_remove_15.q.out ql/src/test/results/clientpositive/spark/union_remove_15.q.out new file mode 100644 index 0000000..463ae63 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_15.q.out @@ -0,0 +1,278 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. 
+-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- This tests demonstrates that this optimization works in the presence of dynamic partitions. + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- This tests demonstrates that this optimization works in the presence of dynamic partitions. + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL 
Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint), '2' (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint), '1' (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: 
query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=1 +POSTHOOK: Output: default@outputtbl1@ds=2 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Partition Information +# col_name data_type comment + +ds string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions outputTbl1 +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: show partitions outputTbl1 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Input: default@outputtbl1 +ds=1 +ds=2 +PREHOOK: query: select * from outputTbl1 where ds = '1' order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +PREHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '1' order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +POSTHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +1 1 1 +2 1 1 +3 1 1 +7 1 1 +8 2 1 +PREHOOK: query: select * from outputTbl1 where ds = '2' order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +PREHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '2' order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +POSTHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +1 1 2 +2 1 2 +3 1 2 +7 1 2 +8 2 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_16.q.out ql/src/test/results/clientpositive/spark/union_remove_16.q.out new file mode 100644 index 0000000..bacb52f --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_16.q.out @@ -0,0 +1,306 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by 
select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on +-- This test demonstrates that this optimization works in the presence of dynamic partitions. +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on +-- This test demonstrates that this optimization works in the presence of dynamic partitions. +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5 + Stage-4 + Stage-2 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-0 depends on stages: Stage-2 + Stage-3 + Stage-5 + Stage-6 depends on stages: Stage-5 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + 
TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint), '2' (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint), '1' (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-7 + Conditional Operator + + Stage: Stage-4 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: 
org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Merge Work + merge level: block + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + + Stage: Stage-5 + Merge Work + merge level: block + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + +PREHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, '1' as ds from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values, '2' as ds from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=1 +POSTHOOK: Output: default@outputtbl1@ds=2 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Partition Information +# col_name data_type comment + +ds string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions outputTbl1 +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: show partitions outputTbl1 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Input: default@outputtbl1 +ds=1 +ds=2 +PREHOOK: query: select * from outputTbl1 where ds = '1' order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +PREHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '1' order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +POSTHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +1 1 1 +2 1 1 +3 1 1 +7 1 1 +8 2 1 +PREHOOK: query: select * from outputTbl1 where ds = '2' order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: 
default@outputtbl1 +PREHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '2' order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +POSTHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +1 1 2 +2 1 2 +3 1 2 +7 1 2 +8 2 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_17.q.out ql/src/test/results/clientpositive/spark/union_remove_17.q.out new file mode 100644 index 0000000..9af6da8 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_17.q.out @@ -0,0 +1,236 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need for this optimization, since the query is a map-only query. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- and the results are written to a table using dynamic partitions. +-- There is no need for this optimization, since the query is a map-only query. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + 
+STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 3 <- Union 2 (GROUP SORT) + Union 2 <- Map 1 (NONE), Map 4 (NONE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), 2 (type: int), '2' (type: string) + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: _col2 (type: string) + sort order: + + Map-reduce partition columns: _col2 (type: string) + value expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: string) + Map 4 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), 1 (type: int), '1' (type: string) + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: _col2 (type: string) + sort order: + + Map-reduce partition columns: _col2 (type: string) + value expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: string) + Reducer 3 + Reduce Operator Tree: + Extract + Statistics: Num rows: 0 Data size: 60 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 60 Basic stats: PARTIAL Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Union 2 + Vertex: Union 2 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, 1 as values, '1' as ds from inputTbl1 + UNION ALL + SELECT key, 2 as values, '2' as ds from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=1 +POSTHOOK: Output: default@outputtbl1@ds=2 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=1).values EXPRESSION [] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2).values EXPRESSION [] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 
+# col_name data_type comment + +key string +values bigint + +# Partition Information +# col_name data_type comment + +ds string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions outputTbl1 +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: show partitions outputTbl1 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Input: default@outputtbl1 +ds=1 +ds=2 +PREHOOK: query: select * from outputTbl1 where ds = '1' order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +PREHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '1' order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +POSTHOOK: Input: default@outputtbl1@ds=1 +#### A masked pattern was here #### +1 1 1 +2 1 1 +3 1 1 +7 1 1 +8 1 1 +8 1 1 +PREHOOK: query: select * from outputTbl1 where ds = '2' order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +PREHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '2' order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +POSTHOOK: Input: default@outputtbl1@ds=2 +#### A masked pattern was here #### +1 2 2 +2 2 2 +3 2 2 +7 2 2 +8 2 2 +8 2 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_18.q.out ql/src/test/results/clientpositive/spark/union_remove_18.q.out new file mode 100644 index 0000000..fba6634 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_18.q.out @@ -0,0 +1,318 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- This test demonstrates that the optimization works with dynamic partitions irrespective of the +-- file format of the output file +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, ds string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off +-- This test demonstrates that the optimization works with dynamic partitions irrespective of the +-- file format of the output file +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, ds string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds + UNION ALL + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds + UNION ALL + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string), ds (type: string) + outputColumnNames: key, ds + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string), ds (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col2 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string), ds (type: string) + outputColumnNames: key, ds + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string), ds (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + 
key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col2 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string), KEY._col1 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col2 (type: bigint), _col1 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string), KEY._col1 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col2 (type: bigint), _col1 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds + UNION ALL + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 partition (ds) +SELECT * +FROM ( + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds + UNION ALL + SELECT key, count(1) as values, ds from inputTbl1 group by key, ds +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=11 +POSTHOOK: Output: default@outputtbl1@ds=12 +POSTHOOK: Output: default@outputtbl1@ds=13 +POSTHOOK: Output: default@outputtbl1@ds=17 +POSTHOOK: Output: default@outputtbl1@ds=18 +POSTHOOK: Output: default@outputtbl1@ds=28 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=11).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, 
] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=12).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=13).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=17).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=18).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=28).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Partition Information +# col_name data_type comment + +ds string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions outputTbl1 +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: show partitions outputTbl1 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Input: default@outputtbl1 +ds=11 +ds=12 +ds=13 +ds=17 +ds=18 +ds=28 +PREHOOK: query: select * from outputTbl1 where ds = '11' order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +PREHOOK: Input: default@outputtbl1@ds=11 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds = '11' order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +POSTHOOK: Input: default@outputtbl1@ds=11 +#### A masked pattern was here #### +1 1 11 +1 1 11 +PREHOOK: query: select * from outputTbl1 where ds = '18' order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +PREHOOK: Input: default@outputtbl1@ds=18 +#### A masked pattern was here #### +POSTHOOK: query: select * from 
outputTbl1 where ds = '18' order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +POSTHOOK: Input: default@outputtbl1@ds=18 +#### A masked pattern was here #### +8 1 18 +8 1 18 +PREHOOK: query: select * from outputTbl1 where ds is not null order by key, values, ds +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +PREHOOK: Input: default@outputtbl1@ds=11 +PREHOOK: Input: default@outputtbl1@ds=12 +PREHOOK: Input: default@outputtbl1@ds=13 +PREHOOK: Input: default@outputtbl1@ds=17 +PREHOOK: Input: default@outputtbl1@ds=18 +PREHOOK: Input: default@outputtbl1@ds=28 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 where ds is not null order by key, values, ds +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +POSTHOOK: Input: default@outputtbl1@ds=11 +POSTHOOK: Input: default@outputtbl1@ds=12 +POSTHOOK: Input: default@outputtbl1@ds=13 +POSTHOOK: Input: default@outputtbl1@ds=17 +POSTHOOK: Input: default@outputtbl1@ds=18 +POSTHOOK: Input: default@outputtbl1@ds=28 +#### A masked pattern was here #### +1 1 11 +1 1 11 +2 1 12 +2 1 12 +3 1 13 +3 1 13 +7 1 17 +7 1 17 +8 1 18 +8 1 18 +8 1 28 +8 1 28 diff --git ql/src/test/results/clientpositive/spark/union_remove_19.q.out ql/src/test/results/clientpositive/spark/union_remove_19.q.out new file mode 100644 index 0000000..8dd4057 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_19.q.out @@ -0,0 +1,603 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT a.key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT a.key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: 
mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT a.key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT a.key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 40 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No 
+Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +7 1 +2 1 +8 2 +3 1 +1 1 +7 1 +2 1 +8 2 +3 1 +1 1 +PREHOOK: query: -- filter should be fine +explain +insert overwrite table outputTbl1 +SELECT a.key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a where a.key = 7 +PREHOOK: type: QUERY +POSTHOOK: query: -- filter should be fine +explain +insert overwrite table outputTbl1 +SELECT a.key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a where a.key = 7 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 30 Data size: 30 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key = 7) (type: boolean) + Statistics: Num rows: 15 Data size: 15 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: '7' (type: string) + outputColumnNames: key + Statistics: Num rows: 15 Data size: 15 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 15 Data size: 15 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 15 Data size: 15 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 30 Data size: 30 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key = 7) (type: boolean) + Statistics: Num rows: 15 Data size: 15 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: '7' (type: string) + outputColumnNames: key + Statistics: Num rows: 15 Data size: 15 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 15 Data size: 15 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 15 Data size: 15 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 7 Data size: 7 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 7 Data size: 7 Basic stats: COMPLETE Column stats: NONE + File 
Output Operator + compressed: false + Statistics: Num rows: 7 Data size: 7 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 7 Data size: 7 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 7 Data size: 7 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 7 Data size: 7 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT a.key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a where a.key = 7 +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT a.key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a where a.key = 7 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +7 1 +7 1 +PREHOOK: query: -- filters and sub-queries should be fine +explain +insert overwrite table outputTbl1 +select key, values from +( +SELECT a.key + a.key as key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +) b where b.key >= 7 +PREHOOK: type: QUERY +POSTHOOK: query: -- filters and sub-queries should be fine +explain +insert overwrite table outputTbl1 +select key, values from +( +SELECT a.key + a.key as key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +) b where b.key >= 7 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 
depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: (_col0 + _col0) (type: double), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (_col0 >= 7.0) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: double), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: (_col0 + _col0) (type: double), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Filter Operator + predicate: (_col0 >= 7.0) (type: boolean) + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: double), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic 
stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +select key, values from +( +SELECT a.key + a.key as key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +) b where b.key >= 7 +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +select key, values from +( +SELECT a.key + a.key as key, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +) b where b.key >= 7 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +14.0 1 +14.0 1 +16.0 2 +16.0 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_20.q.out ql/src/test/results/clientpositive/spark/union_remove_20.q.out new file mode 100644 index 0000000..68de623 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_20.q.out @@ -0,0 +1,249 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select and a file sink +-- However, the order of the columns in the select list is different. So, union cannot +-- be removed. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23. The union is removed, the select (which changes the order of +-- columns being selected) is pushed above the union. + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select and a file sink +-- However, the order of the columns in the select list is different. So, union cannot +-- be removed. 
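-- (A reading of the plan below, offered as a sketch rather than normative
-- documentation: the column-reordering select is replicated into each union
-- branch, i.e. "pushed above the union", so each branch writes
-- default.outputtbl1 directly; see the Select Operator emitting
-- (_col1, _col0) ahead of each File Output Operator under Reducer 2 and
-- Reducer 4.)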
+-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23. The union is removed, the select (which changes the order of +-- columns being selected) is pushed above the union. + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(values bigint, key string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(values bigint, key string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT a.values, a.key +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT a.values, a.key +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE 
+ value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col1 (type: bigint), _col0 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col1 (type: bigint), _col0 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT a.values, a.key +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT a.values, a.key +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +values bigint +key string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 40 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 1 +1 2 +1 2 +1 3 +1 3 +1 7 +1 7 +2 8 +2 8 diff --git ql/src/test/results/clientpositive/spark/union_remove_21.q.out ql/src/test/results/clientpositive/spark/union_remove_21.q.out new file mode 100644 index 0000000..384f191 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_21.q.out @@ -0,0 +1,247 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select and a file sink +-- However, all the columns are not selected. So, union cannot +-- be removed. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23. The union is removed, the select (which changes the order of +-- columns being selected) is pushed above the union. + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select and a file sink +-- However, all the columns are not selected. So, union cannot +-- be removed. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23. The union is removed, the select (which changes the order of +-- columns being selected) is pushed above the union. 
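-- (Illustrative only: with the union removed, each branch writes its own
-- subdirectory under the output table directory, which is why these tests
-- require hive.mapred.supports.subdirectories and recursive input dirs.
-- A hypothetical way to inspect the layout from the Hive CLI, with the real
-- warehouse path masked in this output:
--   dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/outputtbl1;
-- )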
+ +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT a.key +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT a.key +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column 
stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT a.key +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT a.key +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 20 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 +1 +2 +2 +3 +3 +7 +7 +8 +8 diff --git 
ql/src/test/results/clientpositive/spark/union_remove_22.q.out ql/src/test/results/clientpositive/spark/union_remove_22.q.out new file mode 100644 index 0000000..0f10ab4 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_22.q.out @@ -0,0 +1,417 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select and a file sink +-- However, some columns are repeated. So, union cannot be removed. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23. The union is removed, the select (which selects columns from +-- both the sub-queries of the union) is pushed above the union. + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select and a file sink +-- However, some columns are repeated. So, union cannot be removed. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23. The union is removed, the select (which selects columns from +-- both the sub-queries of the union) is pushed above the union. + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint, values2 bigint) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint, values2 bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT a.key, a.values, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT a.key, a.values, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0
Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint), _col1 (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint), _col1 (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT a.key, a.values, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL 
+ SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT a.key, a.values, a.values +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1.values2 EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint +values2 bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 60 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +7 1 1 +2 1 1 +8 2 2 +3 1 1 +1 1 1 +7 1 1 +2 1 1 +8 2 2 +3 1 1 +1 1 1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT a.key, concat(a.values, a.values), concat(a.values, a.values) +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT a.key, concat(a.values, a.values), concat(a.values, a.values) +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL 
Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), UDFToLong(concat(_col1, _col1)) (type: bigint), UDFToLong(concat(_col1, _col1)) (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), UDFToLong(concat(_col1, _col1)) (type: bigint), UDFToLong(concat(_col1, _col1)) (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT a.key, concat(a.values, a.values), concat(a.values, a.values) +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT a.key, concat(a.values, 
a.values), concat(a.values, a.values) +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl1.values2 EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 11 11 +1 11 11 +2 11 11 +2 11 11 +3 11 11 +3 11 11 +7 11 11 +7 11 11 +8 22 22 +8 22 22 diff --git ql/src/test/results/clientpositive/spark/union_remove_24.q.out ql/src/test/results/clientpositive/spark/union_remove_24.q.out new file mode 100644 index 0000000..0494612 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_24.q.out @@ -0,0 +1,245 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- One sub-query has a double and the other sub-query has a bigint. +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- One sub-query has a double and the other sub-query has a bigint. 
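+-- (For context: a minimal sketch, not part of the generated test output, assuming
+-- Hive's usual numeric widening rules. The common UNION ALL type of a DOUBLE branch
+-- and a BIGINT branch is DOUBLE, which is why the plan below wraps the BIGINT branch
+-- in UDFToDouble(UDFToLong(_col0)). In effect,
+--   SELECT CAST(key AS BIGINT) AS key, count(1) as values FROM inputTbl1 GROUP BY key
+-- is evaluated as
+--   SELECT CAST(CAST(key AS BIGINT) AS DOUBLE) AS key, count(1) as values FROM inputTbl1 GROUP BY key
+-- before its rows are written to outputTbl1.)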
+-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key double, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key double, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT * FROM +( + SELECT CAST(key AS DOUBLE) AS key, count(1) as values FROM inputTbl1 group by key + UNION ALL + SELECT CAST(key AS BIGINT) AS key, count(1) as values FROM inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +INSERT OVERWRITE TABLE outputTbl1 +SELECT * FROM +( + SELECT CAST(key AS DOUBLE) AS key, count(1) as values FROM inputTbl1 group by key + UNION ALL + SELECT CAST(key AS BIGINT) AS key, count(1) as values FROM inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 
(type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: UDFToDouble(UDFToLong(_col0)) (type: double), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: UDFToDouble(_col0) (type: double), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: INSERT OVERWRITE TABLE outputTbl1 +SELECT * FROM +( + SELECT CAST(key AS DOUBLE) AS key, count(1) as values FROM inputTbl1 group by key + UNION ALL + SELECT CAST(key AS BIGINT) AS key, count(1) as values FROM inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: INSERT OVERWRITE TABLE outputTbl1 +SELECT * FROM +( + SELECT CAST(key AS DOUBLE) AS key, count(1) as values FROM inputTbl1 group by key + UNION ALL + SELECT CAST(key AS BIGINT) AS key, count(1) as values FROM inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key double +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 60 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: 
org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1.0 1 +1.0 1 +2.0 1 +2.0 1 +3.0 1 +3.0 1 +7.0 1 +7.0 1 +8.0 2 +8.0 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_25.q.out ql/src/test/results/clientpositive/spark/union_remove_25.q.out new file mode 100644 index 0000000..c58c90a --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_25.q.out @@ -0,0 +1,666 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) partitioned by (ds string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: create table outputTbl2(key string, values bigint) partitioned by (ds string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl2 +POSTHOOK: query: create table outputTbl2(key string, values bigint) partitioned by (ds string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl2 +PREHOOK: query: create table outputTbl3(key string, values bigint) partitioned by (ds string,hr string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl3 +POSTHOOK: query: create table outputTbl3(key string, values bigint) partitioned by (ds string,hr string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl3 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 partition(ds='2004') +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 partition(ds='2004') +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: 
PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + partition: + ds 2004 + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 partition(ds='2004') +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1@ds=2004 +POSTHOOK: query: insert overwrite table outputTbl1 partition(ds='2004') +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1@ds=2004 +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2004).key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, 
comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1 PARTITION(ds=2004).values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 partition(ds='2004') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 partition(ds='2004') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Partition Information +# col_name data_type comment + +ds string + +# Detailed Partition Information +Partition Value: [2004] +Database: default +Table: outputtbl1 +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 40 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +PREHOOK: Input: default@outputtbl1@ds=2004 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +POSTHOOK: Input: default@outputtbl1@ds=2004 +#### A masked pattern was here #### +1 1 2004 +1 1 2004 +2 1 2004 +2 1 2004 +3 1 2004 +3 1 2004 +7 1 2004 +7 1 2004 +8 2 2004 +8 2 2004 +PREHOOK: query: explain +insert overwrite table outputTbl2 partition(ds) +SELECT * +FROM ( + SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500 + UNION ALL + SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl2 partition(ds) +SELECT * +FROM ( + SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500 + UNION ALL + SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500 +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string), ds (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 500 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + Map 3 + Map Operator Tree: + TableScan + alias: srcpart + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string), ds (type: string) + outputColumnNames: _col0, 
_col1, _col2 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 500 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string) + Reducer 2 + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 500 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + Reducer 4 + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 500 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + +PREHOOK: query: insert overwrite table outputTbl2 partition(ds) +SELECT * +FROM ( + SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500 + UNION ALL + SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +PREHOOK: Output: default@outputtbl2 +POSTHOOK: query: insert overwrite table outputTbl2 partition(ds) +SELECT * +FROM ( + SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500 + UNION ALL + SELECT key, value, ds from srcpart where ds='2008-04-08' limit 500 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +POSTHOOK: Input: 
default@srcpart@ds=2008-04-08/hr=12 +POSTHOOK: Output: default@outputtbl2@ds=2008-04-08 +POSTHOOK: Lineage: outputtbl2 PARTITION(ds=2008-04-08).key EXPRESSION [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), (srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: outputtbl2 PARTITION(ds=2008-04-08).values EXPRESSION [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), (srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: show partitions outputTbl2 +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@outputtbl2 +POSTHOOK: query: show partitions outputTbl2 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Input: default@outputtbl2 +ds=2008-04-08 +PREHOOK: query: desc formatted outputTbl2 partition(ds='2008-04-08') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl2 +POSTHOOK: query: desc formatted outputTbl2 partition(ds='2008-04-08') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl2 +# col_name data_type comment + +key string +values bigint + +# Partition Information +# col_name data_type comment + +ds string + +# Detailed Partition Information +Partition Value: [2008-04-08] +Database: default +Table: outputtbl2 +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 6812 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: explain insert overwrite table outputTbl3 partition(ds, hr) +SELECT * +FROM ( + SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000 + UNION ALL + SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain insert overwrite table outputTbl3 partition(ds, hr) +SELECT * +FROM ( + SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000 + UNION ALL + SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000 +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string), ds (type: string), hr (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1000 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string) + Map 3 + Map Operator Tree: + TableScan + alias: srcpart + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE 
Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string), ds (type: string), hr (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1000 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string) + Reducer 2 + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string), VALUE._col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1000 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint), _col2 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl3 + Reducer 4 + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string), VALUE._col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1000 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint), _col2 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 58 Data size: 11624 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl3 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + partition: + ds + hr + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl3 + +PREHOOK: query: insert overwrite table outputTbl3 partition(ds, hr) +SELECT * +FROM ( + SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000 + UNION ALL + SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +PREHOOK: Output: default@outputtbl3 +POSTHOOK: query: insert overwrite 
table outputTbl3 partition(ds, hr) +SELECT * +FROM ( + SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000 + UNION ALL + SELECT key, value, ds, hr from srcpart where ds='2008-04-08' limit 1000 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +POSTHOOK: Output: default@outputtbl3@ds=2008-04-08/hr=11 +POSTHOOK: Output: default@outputtbl3@ds=2008-04-08/hr=12 +POSTHOOK: Lineage: outputtbl3 PARTITION(ds=2008-04-08,hr=11).key EXPRESSION [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), (srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: outputtbl3 PARTITION(ds=2008-04-08,hr=11).values EXPRESSION [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), (srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: outputtbl3 PARTITION(ds=2008-04-08,hr=12).key EXPRESSION [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), (srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: outputtbl3 PARTITION(ds=2008-04-08,hr=12).values EXPRESSION [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), (srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: show partitions outputTbl3 +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@outputtbl3 +POSTHOOK: query: show partitions outputTbl3 +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Input: default@outputtbl3 +ds=2008-04-08/hr=11 +ds=2008-04-08/hr=12 +PREHOOK: query: desc formatted outputTbl3 partition(ds='2008-04-08', hr='11') +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl3 +POSTHOOK: query: desc formatted outputTbl3 partition(ds='2008-04-08', hr='11') +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl3 +# col_name data_type comment + +key string +values bigint + +# Partition Information +# col_name data_type comment + +ds string +hr string + +# Detailed Partition Information +Partition Value: [2008-04-08, 11] +Database: default +Table: outputtbl3 +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 6812 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 diff --git ql/src/test/results/clientpositive/spark/union_remove_3.q.out ql/src/test/results/clientpositive/spark/union_remove_3.q.out new file mode 100644 index 0000000..11c352d --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_3.q.out @@ -0,0 +1,235 @@ +PREHOOK: query: -- This is to test the union->remove->filesink optimization +-- Union of 3 subqueries is performed (all of which are map-only queries) +-- followed by select star and a file sink. +-- There is no need for any optimization, since the whole query can be processed in +-- a single map-only job +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->remove->filesink optimization +-- Union of 3 subqueries is performed (all of which are map-only queries) +-- followed by select star and a file sink. +-- There is no need for any optimization, since the whole query can be processed in +-- a single map-only job +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Union 2 <- Map 1 (NONE), Map 3 (NONE), Map 4 (NONE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), 3 (type: int) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), 1 (type: int) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + 
input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Map 4 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), 2 (type: int) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Union 2 + Vertex: Union 2 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 + UNION ALL + SELECT key, 3 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 3 + numRows -1 + rawDataSize -1 + totalSize 72 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 2 +1 3 +2 1 +2 2 +2 3 +3 1 +3 2 +3 3 +7 1 +7 2 +7 3 +8 1 +8 1 +8 2 +8 2 +8 3 +8 3 diff --git ql/src/test/results/clientpositive/spark/union_remove_4.q.out 
ql/src/test/results/clientpositive/spark/union_remove_4.q.out new file mode 100644 index 0000000..a429f3e --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_4.q.out @@ -0,0 +1,297 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5 + Stage-4 + Stage-2 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-0 depends on stages: Stage-2 + Stage-3 + Stage-5 + Stage-6 depends on stages: Stage-5 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: 
NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-7 + Conditional Operator + + Stage: Stage-4 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Spark +#### A masked pattern was here #### + Vertices: + Merge + Map Operator Tree: + TableScan + File 
Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-5 + Spark +#### A masked pattern was here #### + Vertices: + Merge + Map Operator Tree: + TableScan + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 40 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 1 +2 1 +2 1 +3 1 +3 1 +7 1 +7 1 +8 2 +8 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_5.q.out ql/src/test/results/clientpositive/spark/union_remove_5.q.out new file mode 100644 index 0000000..c2c7c08 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_5.q.out @@ -0,0 +1,308 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. 
+-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5 + Stage-4 + Stage-2 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-0 depends on stages: Stage-2 + Stage-3 + Stage-5 + Stage-6 depends on stages: Stage-5 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 3 <- Map 2 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string), UDFToLong(2) (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 30 Basic 
stats: PARTIAL Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Map 2 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 4 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string), UDFToLong(1) (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Reducer 3 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-7 + Conditional Operator + + Stage: Stage-4 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-3 + Spark +#### A masked pattern was here #### + Vertices: + Merge + Map Operator Tree: + TableScan + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-5 + Spark +#### A masked pattern was here #### + Vertices: + Merge + Map Operator Tree: + TableScan + File Output Operator + compressed: false + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-6 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 3 + numRows -1 + rawDataSize -1 + totalSize 68 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 1 +1 2 +2 1 +2 1 +2 2 +3 1 +3 1 +3 2 +7 1 +7 1 +7 2 +8 1 +8 1 +8 2 +8 2 +8 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_6.q.out ql/src/test/results/clientpositive/spark/union_remove_6.q.out new file mode 100644 index 0000000..1bc55f4 --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_6.q.out @@ -0,0 +1,269 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (all of which are mapred queries) +-- followed by select star and a file sink in 2 output tables. +-- The optimization does not take effect since it is a multi-table insert. +-- It does not matter, whether the output is merged or not.
In this case, +-- merging is turned off + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (all of which are mapred queries) +-- followed by select star and a file sink in 2 output tables. +-- The optimization does not take effect since it is a multi-table insert. +-- It does not matter, whether the output is merged or not. In this case, +-- merging is turned off + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: create table outputTbl2(key string, values bigint) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl2 +POSTHOOK: query: create table outputTbl2(key string, values bigint) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl2 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +PREHOOK: type: QUERY +POSTHOOK: query: explain +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-3 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-3 + Stage-1 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-2 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 5 <- Map 4 (GROUP) + Union 3 <- Reducer 2 (NONE), Reducer 5 (NONE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value
expressions: _col1 (type: bigint) + Map 4 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + Reducer 5 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + Union 3 + Vertex: Union 3 + + Stage: Stage-3 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl1 + + Stage: Stage-1 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.outputtbl2 + +PREHOOK: 
query: FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +PREHOOK: Output: default@outputtbl2 +POSTHOOK: query: FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +insert overwrite table outputTbl1 select * +insert overwrite table outputTbl2 select * +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Output: default@outputtbl2 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +POSTHOOK: Lineage: outputtbl2.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl2.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 1 +2 1 +2 1 +3 1 +3 1 +7 1 +7 1 +8 2 +8 2 +PREHOOK: query: select * from outputTbl2 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl2 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl2 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl2 +#### A masked pattern was here #### +1 1 +1 1 +2 1 +2 1 +3 1 +3 1 +7 1 +7 1 +8 2 +8 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_7.q.out ql/src/test/results/clientpositive/spark/union_remove_7.q.out new file mode 100644 index 0000000..b3ec85b --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_7.q.out @@ -0,0 +1,251 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 map-reduce subqueries is performed followed by select star and a file sink +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. 
The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Reducer 4 <- Map 3 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 
(type: bigint) + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, count(1) as values from inputTbl1 group by key +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, (inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 2 + numRows -1 + rawDataSize -1 + totalSize 178 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: 
org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 1 +2 1 +2 1 +3 1 +3 1 +7 1 +7 1 +8 2 +8 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_8.q.out ql/src/test/results/clientpositive/spark/union_remove_8.q.out new file mode 100644 index 0000000..d55b60d --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_8.q.out @@ -0,0 +1,260 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- off + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 3 subqueries is performed (exactly one of which requires a map-reduce job) +-- followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- off + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 3 <- Map 2 (GROUP) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string), UDFToLong(2) (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Map 2 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 4 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string), 
UDFToLong(1) (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Reducer 3 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * +FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 3 + numRows -1 + rawDataSize -1 + totalSize 271 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + 
serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 1 +1 2 +2 1 +2 1 +2 2 +3 1 +3 1 +3 2 +7 1 +7 1 +7 2 +8 1 +8 1 +8 2 +8 2 +8 2 -- 1.8.5.2 (Apple Git-48)
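
For anyone reproducing these golden files by hand: the union_remove_*.q tests above all share the same scaffolding, sketched end to end below. The set statements are assumed from the standard union-remove test pattern — they are not echoed in the .q.out files — and the two merge flags flip to true in the variants whose headers say merging is turned on (union_remove_4, union_remove_5).

    set hive.optimize.union.remove=true;           -- the optimization under test (HIVE-7767)
    set hive.mapred.supports.subdirectories=true;  -- allow sub-directories under the table location
    set hive.merge.mapfiles=false;                 -- merging off, as in union_remove_1/6/7/8
    set hive.merge.mapredfiles=false;
    set mapred.input.dir.recursive=true;           -- read the per-branch sub-directories back

    create table inputTbl1(key string, val string) stored as textfile;
    create table outputTbl1(key string, values bigint) stored as textfile;
    load data local inpath '../../data/files/T1.txt' into table inputTbl1;

    -- With union removal enabled, each union branch writes straight into its
    -- own sub-directory under outputTbl1 (hence numFiles 2 or 3 in the desc
    -- formatted output above) instead of a separate job materializing and
    -- re-reading the union result.
    insert overwrite table outputTbl1
    select * from (
      select key, count(1) as values from inputTbl1 group by key
      union all
      select key, count(1) as values from inputTbl1 group by key
    ) a;

    select * from outputTbl1 order by key, values;

When the merge flags are on instead, the conditional Stage-3/Stage-5 "Merge" vertices in the union_remove_4 and union_remove_5 plans above fold those per-branch files back together before the final Move Operator.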