From 5cb021c8f7a359aa80a359bb488f25365171a37e Mon Sep 17 00:00:00 2001 From: Na Yang Date: Fri, 22 Aug 2014 17:38:02 -0700 Subject: [PATCH] HIVE-7810: Insert overwrite table query has strange behavior when set hive.optimize.union.remove=true [Spark Branch] --- .../java/org/apache/hadoop/hive/conf/HiveConf.java | 1 + .../test/resources/testconfiguration.properties | 2 + .../hadoop/hive/ql/optimizer/GenMapRedUtils.java | 3 + .../clientpositive/spark/union_remove_10.q.out | 273 +++++++++++++++++++++ .../clientpositive/spark/union_remove_16.q.out | 32 +-- .../clientpositive/spark/union_remove_4.q.out | 52 +--- .../clientpositive/spark/union_remove_5.q.out | 52 +--- .../clientpositive/spark/union_remove_9.q.out | 269 ++++++++++++++++++++ 8 files changed, 551 insertions(+), 133 deletions(-) create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_10.q.out create mode 100644 ql/src/test/results/clientpositive/spark/union_remove_9.q.out diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 31d45fa..ceb8721 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -784,6 +784,7 @@ HIVEMERGEMAPREDFILES("hive.merge.mapredfiles", false, "Merge small files at the end of a map-reduce job"), HIVEMERGETEZFILES("hive.merge.tezfiles", false, "Merge small files at the end of a Tez DAG"), + HIVEMERGESPARKFILES("hive.merge.sparkfiles", false, "Merge small files at the end of a Spark DAG Transformation"), HIVEMERGEMAPFILESSIZE("hive.merge.size.per.task", (long) (256 * 1000 * 1000), "Size of merged files at the end of the job"), HIVEMERGEMAPFILESAVGSIZE("hive.merge.smallfiles.avgsize", (long) (16 * 1000 * 1000), diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index de4a36e..e134e39 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -344,6 +344,7 @@ spark.query.files=spark_test.q \ timestamp_null.q \ timestamp_udf.q \ union_remove_1.q \ + union_remove_10.q \ union_remove_11.q \ union_remove_15.q \ union_remove_16.q \ @@ -361,6 +362,7 @@ spark.query.files=spark_test.q \ union_remove_6.q \ union_remove_7.q \ union_remove_8.q \ + union_remove_9.q \ union_null.q \ union_ppr.q \ union.q \ diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java index d9c546a..9c808d4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java @@ -1671,6 +1671,9 @@ public static boolean isMergeRequired(List> mvTasks, HiveConf hco // tez blurs the boundary between map and reduce, thus it has it's own // config return hconf.getBoolVar(ConfVars.HIVEMERGETEZFILES); + } else if (currTask.getWork() instanceof SparkWork) { + // spark has its own config for merging + return hconf.getBoolVar(ConfVars.HIVEMERGESPARKFILES); } if (fsOp.getConf().isLinkedFileSink()) { diff --git ql/src/test/results/clientpositive/spark/union_remove_10.q.out ql/src/test/results/clientpositive/spark/union_remove_10.q.out new file mode 100644 index 0000000..927a15d --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_10.q.out @@ -0,0 +1,273 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is 
performed (one of which is a map-only query, and the +-- other one contains a nested union where one of the sub-queries requires a map-reduce +-- job), followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The outer union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which is a map-only query, and the +-- other one contains a nested union where one of the sub-queries requires a map-reduce +-- job), followed by select star and a file sink. +-- There is no need to write the temporary results of the sub-queries, and then read them +-- again to process the union. The outer union can be removed completely. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP) + Union 3 <- Map 5 (NONE), Reducer 2 (NONE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: 
inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 4 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string), UDFToLong(1) (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Map 5 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), UDFToLong(2) (type: bigint) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Union 3 + Vertex: Union 3 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, 1 as values from inputTbl1 +union all +select * FROM ( + SELECT key, count(1) as values from inputTbl1 group by key + UNION ALL + SELECT key, 2 as 
values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 3 + numRows -1 + rawDataSize -1 + totalSize 271 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 1 +1 2 +2 1 +2 1 +2 2 +3 1 +3 1 +3 2 +7 1 +7 1 +7 2 +8 1 +8 1 +8 2 +8 2 +8 2 diff --git ql/src/test/results/clientpositive/spark/union_remove_16.q.out ql/src/test/results/clientpositive/spark/union_remove_16.q.out index bacb52f..0954ae4 100644 --- ql/src/test/results/clientpositive/spark/union_remove_16.q.out +++ ql/src/test/results/clientpositive/spark/union_remove_16.q.out @@ -66,13 +66,8 @@ FROM ( POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5 - Stage-4 - Stage-2 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-2 depends on stages: Stage-1 Stage-0 depends on stages: Stage-2 - Stage-3 - Stage-5 - Stage-6 depends on stages: Stage-5 STAGE PLANS: Stage: Stage-1 @@ -165,15 +160,6 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe name: default.outputtbl1 - Stage: Stage-7 - Conditional Operator - - Stage: Stage-4 - Move Operator - files: - hdfs directory: true -#### A masked pattern was here #### - Stage: Stage-2 Dependency Collection @@ -189,22 +175,6 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe name: default.outputtbl1 - Stage: Stage-3 - Merge Work - merge level: block - input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat - - Stage: Stage-5 - Merge Work - merge level: block - input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat - - Stage: Stage-6 - Move Operator - files: - hdfs directory: true -#### A masked pattern was here #### - PREHOOK: query: insert overwrite table outputTbl1 partition (ds) SELECT * FROM ( diff --git ql/src/test/results/clientpositive/spark/union_remove_4.q.out ql/src/test/results/clientpositive/spark/union_remove_4.q.out index a429f3e..cc46dda 100644 --- 
ql/src/test/results/clientpositive/spark/union_remove_4.q.out +++ ql/src/test/results/clientpositive/spark/union_remove_4.q.out @@ -62,13 +62,8 @@ FROM ( POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5 - Stage-4 - Stage-2 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-2 depends on stages: Stage-1 Stage-0 depends on stages: Stage-2 - Stage-3 - Stage-5 - Stage-6 depends on stages: Stage-5 STAGE PLANS: Stage: Stage-1 @@ -161,15 +156,6 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.outputtbl1 - Stage: Stage-7 - Conditional Operator - - Stage: Stage-4 - Move Operator - files: - hdfs directory: true -#### A masked pattern was here #### - Stage: Stage-2 Dependency Collection @@ -183,42 +169,6 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.outputtbl1 - Stage: Stage-3 - Spark -#### A masked pattern was here #### - Vertices: - Merge - Map Operator Tree: - TableScan - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.outputtbl1 - - Stage: Stage-5 - Spark -#### A masked pattern was here #### - Vertices: - Merge - Map Operator Tree: - TableScan - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.outputtbl1 - - Stage: Stage-6 - Move Operator - files: - hdfs directory: true -#### A masked pattern was here #### - PREHOOK: query: insert overwrite table outputTbl1 SELECT * FROM ( diff --git ql/src/test/results/clientpositive/spark/union_remove_5.q.out ql/src/test/results/clientpositive/spark/union_remove_5.q.out index c2c7c08..f6cdeb3 100644 --- ql/src/test/results/clientpositive/spark/union_remove_5.q.out +++ ql/src/test/results/clientpositive/spark/union_remove_5.q.out @@ -70,13 +70,8 @@ FROM ( POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-7 depends on stages: Stage-1 , consists of Stage-4, Stage-3, Stage-5 - Stage-4 - Stage-2 depends on stages: Stage-4, Stage-3, Stage-6 + Stage-2 depends on stages: Stage-1 Stage-0 depends on stages: Stage-2 - Stage-3 - Stage-5 - Stage-6 depends on stages: Stage-5 STAGE PLANS: Stage: Stage-1 @@ -161,15 +156,6 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.outputtbl1 - Stage: Stage-7 - Conditional Operator - - Stage: Stage-4 - Move Operator - files: - hdfs directory: true -#### A masked pattern was here #### - Stage: Stage-2 Dependency Collection @@ -183,42 +169,6 @@ STAGE PLANS: serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.outputtbl1 - Stage: Stage-3 - Spark -#### A masked pattern was here #### - Vertices: - Merge - Map Operator Tree: - TableScan - File Output Operator - compressed: false - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.outputtbl1 - - Stage: Stage-5 - Spark -#### A masked pattern was here #### - Vertices: - Merge - Map Operator Tree: - TableScan - File Output Operator - compressed: false - table: - input 
format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.outputtbl1 - - Stage: Stage-6 - Move Operator - files: - hdfs directory: true -#### A masked pattern was here #### - PREHOOK: query: insert overwrite table outputTbl1 SELECT * FROM ( diff --git ql/src/test/results/clientpositive/spark/union_remove_9.q.out ql/src/test/results/clientpositive/spark/union_remove_9.q.out new file mode 100644 index 0000000..1f0260c --- /dev/null +++ ql/src/test/results/clientpositive/spark/union_remove_9.q.out @@ -0,0 +1,269 @@ +PREHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which contains a union and is map-only), +-- and the other one is a map-reduce query followed by select star and a file sink. +-- There is no need for the outer union. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@inputTbl1 +POSTHOOK: query: -- This is to test the union->selectstar->filesink optimization +-- Union of 2 subqueries is performed (one of which contains a union and is map-only), +-- and the other one is a map-reduce query followed by select star and a file sink. +-- There is no need for the outer union. +-- The final file format is different from the input and intermediate file format. +-- It does not matter, whether the output is merged or not. 
In this case, merging is turned +-- on + +-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23) +-- Since this test creates sub-directories for the output table outputTbl1, it might be easier +-- to run the test only on hadoop 23 + +create table inputTbl1(key string, val string) stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@inputTbl1 +PREHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@outputTbl1 +POSTHOOK: query: create table outputTbl1(key string, values bigint) stored as rcfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@outputTbl1 +PREHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@inputtbl1 +POSTHOOK: query: load data local inpath '../../data/files/T1.txt' into table inputTbl1 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@inputtbl1 +PREHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +POSTHOOK: query: explain +insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 4 <- Map 3 (GROUP) + Union 2 <- Map 1 (NONE), Map 5 (NONE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), 1 (type: int) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint) + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Map 3 + Map Operator Tree: + TableScan + alias: inputtbl1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: count(1) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 0 Data size: 30 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: bigint) + Map 5 + Map Operator Tree: + TableScan + alias: inputtbl1 + Select Operator + expressions: key (type: string), 2 (type: int) + outputColumnNames: _col0, _col1 + Select Operator + expressions: _col0 (type: string), UDFToLong(_col1) (type: bigint) + 
outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Reducer 4 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + Union 2 + Vertex: Union 2 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat + output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat + serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe + name: default.outputtbl1 + +PREHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +PREHOOK: type: QUERY +PREHOOK: Input: default@inputtbl1 +PREHOOK: Output: default@outputtbl1 +POSTHOOK: query: insert overwrite table outputTbl1 +SELECT * FROM +( +select key, count(1) as values from inputTbl1 group by key +union all +select * FROM ( + SELECT key, 1 as values from inputTbl1 + UNION ALL + SELECT key, 2 as values from inputTbl1 +) a +)b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@inputtbl1 +POSTHOOK: Output: default@outputtbl1 +POSTHOOK: Lineage: outputtbl1.key EXPRESSION [(inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), (inputtbl1)inputtbl1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: outputtbl1.values EXPRESSION [(inputtbl1)inputtbl1.null, ] +PREHOOK: query: desc formatted outputTbl1 +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@outputtbl1 +POSTHOOK: query: desc formatted outputTbl1 +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@outputtbl1 +# col_name data_type comment + +key string +values bigint + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE false + numFiles 3 + numRows -1 + rawDataSize -1 + totalSize 271 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe +InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from outputTbl1 order by key, values +PREHOOK: type: QUERY +PREHOOK: Input: 
default@outputtbl1 +#### A masked pattern was here #### +POSTHOOK: query: select * from outputTbl1 order by key, values +POSTHOOK: type: QUERY +POSTHOOK: Input: default@outputtbl1 +#### A masked pattern was here #### +1 1 +1 1 +1 2 +2 1 +2 1 +2 2 +3 1 +3 1 +3 2 +7 1 +7 1 +7 2 +8 1 +8 1 +8 2 +8 2 +8 2 -- 1.8.5.2 (Apple Git-48)
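
For reviewers skimming the .q.out churn above, the functional core of the patch is small: a new hive.merge.sparkfiles flag in HiveConf (default false, mirroring hive.merge.mapredfiles and hive.merge.tezfiles) and an engine dispatch in GenMapRedUtils.isMergeRequired() so that Spark plans consult that flag instead of falling through to the MapReduce merge logic. The standalone sketch below mirrors that dispatch only; Work, Conf, and MergeDecisionSketch are simplified stand-ins for illustration, not the real Hive classes or method signature, and the MapReduce-side checks (map-only vs. map-reduce output, linked file sinks) are collapsed into a single flag here.

public class MergeDecisionSketch {

    // Stand-ins for Hive's MapredWork / TezWork / SparkWork task payloads.
    interface Work {}
    static class MapredWork implements Work {}
    static class TezWork    implements Work {}
    static class SparkWork  implements Work {}

    // Stand-in for the three HiveConf flags involved here.
    static class Conf {
        boolean mergeMapredFiles; // hive.merge.mapredfiles
        boolean mergeTezFiles;    // hive.merge.tezfiles
        boolean mergeSparkFiles;  // hive.merge.sparkfiles (added by this patch)
    }

    // Each engine consults its own small-file-merge flag. Before this
    // patch, SparkWork fell through to the MapReduce path, which is why
    // the union_remove_*.q.out plans above carried extra conditional
    // merge/move stages that are now gone.
    static boolean isMergeRequired(Work currWork, Conf conf) {
        if (currWork instanceof TezWork) {
            return conf.mergeTezFiles;    // tez has its own config
        } else if (currWork instanceof SparkWork) {
            return conf.mergeSparkFiles;  // spark has its own config
        }
        return conf.mergeMapredFiles;     // classic map-reduce path (simplified)
    }

    public static void main(String[] args) {
        Conf conf = new Conf();           // all flags default to false
        // Matches the new golden files: no merge stages for Spark plans
        // unless hive.merge.sparkfiles is explicitly enabled.
        System.out.println(isMergeRequired(new SparkWork(), conf)); // false
        conf.mergeSparkFiles = true;
        System.out.println(isMergeRequired(new SparkWork(), conf)); // true
    }
}

With the patch applied, users opt in per session just as with the other engines, e.g. "set hive.merge.sparkfiles=true;" before running the INSERT OVERWRITE.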