diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkProcContext.java ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkProcContext.java
index b0ab495..4d2bcfa 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkProcContext.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkProcContext.java
@@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.parse.spark;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.common.ObjectPair;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.exec.DependencyCollectionTask;
@@ -38,6 +39,7 @@ import org.apache.hadoop.hive.ql.parse.ParseContext;
 import org.apache.hadoop.hive.ql.plan.BaseWork;
 import org.apache.hadoop.hive.ql.plan.DependencyCollectionWork;
+import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
 import org.apache.hadoop.hive.ql.plan.MoveWork;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.ReduceWork;
@@ -131,6 +133,8 @@
   public final List<UnionOperator> currentUnionOperators;
   public final Set<BaseWork> workWithUnionOperators;
 
+  // we link file sinks that will write to the same final location
+  public final Map<Path, List<FileSinkDesc>> linkedFileSinks;
   public final Set<FileSinkOperator> fileSinkSet;
   public final Map<FileSinkOperator, List<FileSinkDesc>> fileSinkMap;
 
@@ -180,6 +184,7 @@ public GenSparkProcContext(HiveConf conf,
     this.unionWorkMap = new LinkedHashMap<Operator<?>, BaseWork>();
     this.currentUnionOperators = new LinkedList<UnionOperator>();
     this.workWithUnionOperators = new LinkedHashSet<BaseWork>();
+    this.linkedFileSinks = new LinkedHashMap<>();
     this.fileSinkSet = new LinkedHashSet<FileSinkOperator>();
     this.fileSinkMap = new LinkedHashMap<FileSinkOperator, List<FileSinkDesc>>();
     this.pruningSinkSet = new LinkedHashSet<Operator<?>>();
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java
index 08602e2..4f826a1 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/spark/GenSparkUtils.java
@@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.parse.spark;
+import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.Deque;
 import java.util.HashMap;
@@ -31,6 +32,8 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.hadoop.hive.ql.Context;
+import org.apache.hadoop.hive.ql.exec.DependencyCollectionTask;
 import org.apache.hadoop.hive.ql.exec.FetchTask;
 import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
 import org.apache.hadoop.hive.ql.exec.ForwardOperator;
@@ -43,6 +46,7 @@ import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
 import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
 import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Task;
 import org.apache.hadoop.hive.ql.exec.UnionOperator;
 import org.apache.hadoop.hive.ql.exec.spark.SparkUtilities;
 import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
@@ -53,7 +57,9 @@ import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.ql.plan.BaseWork;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
 import org.apache.hadoop.hive.ql.plan.MapWork;
+import org.apache.hadoop.hive.ql.plan.MoveWork;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.ReduceWork;
 import org.apache.hadoop.hive.ql.plan.SparkEdgeProperty;
@@ -269,6 +275,11 @@ public void removeUnionOperators(GenSparkProcContext context,
       BaseWork work)
     Iterator<Operator<?>> it = newRoots.iterator();
     for (Operator<?> orig: roots) {
+      Set<FileSinkOperator> fsOpSet = OperatorUtils.findOperators(orig, FileSinkOperator.class);
+      for (FileSinkOperator fsOp : fsOpSet) {
+        context.fileSinkSet.remove(fsOp);
+      }
+
       Operator<?> newRoot = it.next();
       if (newRoot instanceof HashTableDummyOperator) {
         dummyOps.add((HashTableDummyOperator) newRoot);
@@ -290,6 +301,26 @@ public void removeUnionOperators(GenSparkProcContext context, BaseWork work)
       Operator<?> current = operators.pop();
       seen.add(current);
 
+      if (current instanceof FileSinkOperator) {
+        FileSinkOperator fileSink = (FileSinkOperator)current;
+
+        // remember it for additional processing later
+        context.fileSinkSet.add(fileSink);
+
+        FileSinkDesc desc = fileSink.getConf();
+        Path path = desc.getDirName();
+        List<FileSinkDesc> linked;
+
+        if (!context.linkedFileSinks.containsKey(path)) {
+          linked = new ArrayList<FileSinkDesc>();
+          context.linkedFileSinks.put(path, linked);
+        }
+        linked = context.linkedFileSinks.get(path);
+        linked.add(desc);
+
+        desc.setLinkedFileSinkDesc(linked);
+      }
+
       if (current instanceof UnionOperator) {
         Operator<?> parent = null;
         int count = 0;
@@ -344,7 +375,7 @@ public void processFileSink(GenSparkProcContext context, FileSinkOperator fileSi
       }
     }
 
-    Path finalName = GenMapRedUtils.createMoveTask(context.currentTask,
+    Path finalName = createMoveTask(context.currentTask,
         chDir, fileSink, parseContext, context.moveTask, hconf, context.dependencyTask);
 
     if (chDir) {
@@ -365,6 +396,52 @@ public void processFileSink(GenSparkProcContext context, FileSinkOperator fileSi
   }
 
   /**
+   * Create and add any dependent move tasks.
+   *
+   * This is forked from {@link GenMapRedUtils}. The difference is that it doesn't check
+   * 'isLinkedFileSink' and does not set parent dir for the linked file sinks.
+   */
+  public static Path createMoveTask(Task<? extends Serializable> currTask, boolean chDir,
+      FileSinkOperator fsOp, ParseContext parseCtx, List<Task<MoveWork>> mvTasks,
+      HiveConf hconf, DependencyCollectionTask dependencyTask) {
+
+    Path dest = null;
+
+    if (chDir) {
+      dest = fsOp.getConf().getFinalDirName();
+
+      // generate the temporary file
+      // it must be on the same file system as the current destination
+      Context baseCtx = parseCtx.getContext();
+
+      Path tmpDir = baseCtx.getExternalTmpPath(dest);
+
+      FileSinkDesc fileSinkDesc = fsOp.getConf();
+      // Change all the linked file sink descriptors
+      if (fileSinkDesc.getLinkedFileSinkDesc() != null) {
+        for (FileSinkDesc fsConf : fileSinkDesc.getLinkedFileSinkDesc()) {
+          fsConf.setDirName(tmpDir);
+        }
+      } else {
+        fileSinkDesc.setDirName(tmpDir);
+      }
+    }
+
+    Task<MoveWork> mvTask = null;
+
+    if (!chDir) {
+      mvTask = GenMapRedUtils.findMoveTask(mvTasks, fsOp);
+    }
+
+    // Set the move task to be dependent on the current task
+    if (mvTask != null) {
+      GenMapRedUtils.addDependentMoveTasks(mvTask, hconf, currTask, dependencyTask);
+    }
+
+    return dest;
+  }
+
+  /**
    * Populate partition pruning information from the pruning sink operator to the
    * target MapWork (the MapWork for the big table side). The information include the source table
    * name, column name, and partition key expression. It also set up the temporary path used to
diff --git ql/src/test/queries/clientpositive/spark_union_merge.q ql/src/test/queries/clientpositive/spark_union_merge.q
new file mode 100644
index 0000000..3121078
--- /dev/null
+++ ql/src/test/queries/clientpositive/spark_union_merge.q
@@ -0,0 +1,41 @@
+set hive.mapred.mode=nonstrict;
+-- union case: both subqueries are map jobs on same input, followed by filesink
+-- mostly copied from union.q
+
+set hive.merge.sparkfiles=false;
+
+EXPLAIN EXTENDED
+FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*;
+
+FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*;
+
+dfs -ls ${system:test.warehouse.dir}/union_merge.out;
+
+set hive.merge.sparkfiles=true;
+
+EXPLAIN EXTENDED
+FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*;
+
+FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*;
+
+dfs -ls ${system:test.warehouse.dir}/union_merge.out;
diff --git ql/src/test/results/clientpositive/spark/spark_union_merge.q.out ql/src/test/results/clientpositive/spark/spark_union_merge.q.out
new file mode 100644
index 0000000..f6afd8b
--- /dev/null
+++ ql/src/test/results/clientpositive/spark/spark_union_merge.q.out
@@ -0,0 +1,564 @@
+PREHOOK: query: EXPLAIN EXTENDED
+FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN EXTENDED
+FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: (UDFToDouble(key) < 100.0) (type: boolean)
+                    Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
+                      File Output Operator
+                        compressed: false
+                        GlobalTableId: 1
+#### A masked pattern was here ####
+                        NumFilesPerFileSink: 1
+                        Statistics: Num rows: 332 Data size: 3526 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+                        table:
+                            input format: org.apache.hadoop.mapred.TextInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                            properties:
+                              columns _col0,_col1
+                              columns.types string:string
+                              serialization.format 1
+                              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                        TotalFiles: 1
+                        GatherStats: false
+                        MultiFileSpray: false
+            Path -> Alias:
+#### A masked pattern was here ####
+            Path -> Partition:
+#### A masked pattern was here ####
+                Partition
+                  base file name: src
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+                    bucket_count -1
+                    column.name.delimiter ,
+                    columns key,value
+                    columns.comments 'default','default'
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.src
+                    numFiles 1
+                    numRows 500
+                    rawDataSize 5312
+                    serialization.ddl struct src { string key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    totalSize 5812
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+                      bucket_count -1
+                      column.name.delimiter ,
+                      columns key,value
+                      columns.comments 'default','default'
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.src
+                      numFiles 1
+                      numRows 500
+                      rawDataSize 5312
+                      serialization.ddl struct src { string key, string value}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                      totalSize 5812
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.src
+                  name: default.src
+            Truncated Path -> Alias:
+              /src [null-subquery1:$hdt$_0-subquery1:src]
+        Map 2
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: (UDFToDouble(key) > 100.0) (type: boolean)
+                    Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
+                      File Output Operator
+                        compressed: false
+                        GlobalTableId: 1
+#### A masked pattern was here ####
+                        NumFilesPerFileSink: 1
+                        Statistics: Num rows: 332 Data size: 3526 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+                        table:
+                            input format: org.apache.hadoop.mapred.TextInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                            properties:
+                              columns _col0,_col1
+                              columns.types string:string
+                              serialization.format 1
+                              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                        TotalFiles: 1
+                        GatherStats: false
+                        MultiFileSpray: false
+            Path -> Alias:
+#### A masked pattern was here ####
+            Path -> Partition:
+#### A masked pattern was here ####
+                Partition
+                  base file name: src
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+                    bucket_count -1
+                    column.name.delimiter ,
+                    columns key,value
+                    columns.comments 'default','default'
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.src
+                    numFiles 1
+                    numRows 500
+                    rawDataSize 5312
+                    serialization.ddl struct src { string key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    totalSize 5812
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+                      bucket_count -1
+                      column.name.delimiter ,
+                      columns key,value
+                      columns.comments 'default','default'
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.src
+                      numFiles 1
+                      numRows 500
+                      rawDataSize 5312
+                      serialization.ddl struct src { string key, string value}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                      totalSize 5812
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.src
+                  name: default.src
+            Truncated Path -> Alias:
+              /src [null-subquery2:$hdt$_0-subquery2:src]
+
+  Stage: Stage-0
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+          destination: target/warehouse/union_merge.out
+
+PREHOOK: query: FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: target/warehouse/union_merge.out
+POSTHOOK: query: FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: target/warehouse/union_merge.out
+Found 2 items
+#### A masked pattern was here ####
+PREHOOK: query: EXPLAIN EXTENDED
+FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN EXTENDED
+FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-6 depends on stages: Stage-1 , consists of Stage-3, Stage-2, Stage-4
+  Stage-3
+  Stage-0 depends on stages: Stage-3, Stage-2, Stage-5
+  Stage-2
+  Stage-4
+  Stage-5 depends on stages: Stage-4
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: (UDFToDouble(key) < 100.0) (type: boolean)
+                    Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
+                      File Output Operator
+                        compressed: false
+                        GlobalTableId: 1
+#### A masked pattern was here ####
+                        NumFilesPerFileSink: 1
+                        Statistics: Num rows: 332 Data size: 3526 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+                        table:
+                            input format: org.apache.hadoop.mapred.TextInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                            properties:
+                              columns _col0,_col1
+                              columns.types string:string
+                              serialization.format 1
+                              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                        TotalFiles: 1
+                        GatherStats: false
+                        MultiFileSpray: false
+            Path -> Alias:
+#### A masked pattern was here ####
+            Path -> Partition:
+#### A masked pattern was here ####
+                Partition
+                  base file name: src
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+                    bucket_count -1
+                    column.name.delimiter ,
+                    columns key,value
+                    columns.comments 'default','default'
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.src
+                    numFiles 1
+                    numRows 500
+                    rawDataSize 5312
+                    serialization.ddl struct src { string key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    totalSize 5812
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+                      bucket_count -1
+                      column.name.delimiter ,
+                      columns key,value
+                      columns.comments 'default','default'
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.src
+                      numFiles 1
+                      numRows 500
+                      rawDataSize 5312
+                      serialization.ddl struct src { string key, string value}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                      totalSize 5812
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.src
+                  name: default.src
+            Truncated Path -> Alias:
+              /src [null-subquery1:$hdt$_0-subquery1:src]
+        Map 2
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
+                  GatherStats: false
+                  Filter Operator
+                    isSamplingPred: false
+                    predicate: (UDFToDouble(key) > 100.0) (type: boolean)
+                    Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: key (type: string), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE
+                      File Output Operator
+                        compressed: false
+                        GlobalTableId: 1
+#### A masked pattern was here ####
+                        NumFilesPerFileSink: 1
+                        Statistics: Num rows: 332 Data size: 3526 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+                        table:
+                            input format: org.apache.hadoop.mapred.TextInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                            properties:
+                              columns _col0,_col1
+                              columns.types string:string
+                              serialization.format 1
+                              serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                        TotalFiles: 1
+                        GatherStats: false
+                        MultiFileSpray: false
+            Path -> Alias:
+#### A masked pattern was here ####
+            Path -> Partition:
+#### A masked pattern was here ####
+                Partition
+                  base file name: src
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+                    bucket_count -1
+                    column.name.delimiter ,
+                    columns key,value
+                    columns.comments 'default','default'
+                    columns.types string:string
+#### A masked pattern was here ####
+                    name default.src
+                    numFiles 1
+                    numRows 500
+                    rawDataSize 5312
+                    serialization.ddl struct src { string key, string value}
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    totalSize 5812
+#### A masked pattern was here ####
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
+                      bucket_count -1
+                      column.name.delimiter ,
+                      columns key,value
+                      columns.comments 'default','default'
+                      columns.types string:string
+#### A masked pattern was here ####
+                      name default.src
+                      numFiles 1
+                      numRows 500
+                      rawDataSize 5312
+                      serialization.ddl struct src { string key, string value}
+                      serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                      totalSize 5812
+#### A masked pattern was here ####
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.src
+                  name: default.src
+            Truncated Path -> Alias:
+              /src [null-subquery2:$hdt$_0-subquery2:src]
+
+  Stage: Stage-6
+    Conditional Operator
+
+  Stage: Stage-3
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+  Stage: Stage-0
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+          destination: target/warehouse/union_merge.out
+
+  Stage: Stage-2
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Spark Merge File Work
+          Map Operator Tree:
+              TableScan
+                GatherStats: false
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+#### A masked pattern was here ####
+                  NumFilesPerFileSink: 1
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      properties:
+                        columns _col0,_col1
+                        columns.types string:string
+                        serialization.format 1
+                        serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  TotalFiles: 1
+                  GatherStats: false
+                  MultiFileSpray: false
+          Path -> Alias:
+#### A masked pattern was here ####
+          Path -> Partition:
+#### A masked pattern was here ####
+              Partition
+                base file name: -ext-10002
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                properties:
+                  columns _col0,_col1
+                  columns.types string:string
+                  serialization.format 1
+                  serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    columns _col0,_col1
+                    columns.types string:string
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+  Stage: Stage-4
+    Spark
+#### A masked pattern was here ####
+      Vertices:
+        Spark Merge File Work
+          Map Operator Tree:
+              TableScan
+                GatherStats: false
+                File Output Operator
+                  compressed: false
+                  GlobalTableId: 0
+#### A masked pattern was here ####
+                  NumFilesPerFileSink: 1
+                  table:
+                      input format: org.apache.hadoop.mapred.TextInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                      properties:
+                        columns _col0,_col1
+                        columns.types string:string
+                        serialization.format 1
+                        serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  TotalFiles: 1
+                  GatherStats: false
+                  MultiFileSpray: false
+          Path -> Alias:
+#### A masked pattern was here ####
+          Path -> Partition:
+#### A masked pattern was here ####
+              Partition
+                base file name: -ext-10002
+                input format: org.apache.hadoop.mapred.TextInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                properties:
+                  columns _col0,_col1
+                  columns.types string:string
+                  serialization.format 1
+                  serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    columns _col0,_col1
+                    columns.types string:string
+                    serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+          Truncated Path -> Alias:
+#### A masked pattern was here ####
+
+  Stage: Stage-5
+    Move Operator
+      files:
+          hdfs directory: true
+#### A masked pattern was here ####
+
+PREHOOK: query: FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: target/warehouse/union_merge.out
+POSTHOOK: query: FROM (
+  FROM src select src.key, src.value WHERE src.key < 100
+  UNION ALL
+  FROM src SELECT src.* WHERE src.key > 100
+) unioninput
+INSERT OVERWRITE DIRECTORY 'target/warehouse/union_merge.out' SELECT unioninput.*
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: target/warehouse/union_merge.out
+Found 1 items
+#### A masked pattern was here ####
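Editorial note (not part of the patch): the idea in the GenSparkUtils/GenSparkProcContext changes above is that when a union is split into separate Spark works, each branch gets its own FileSinkOperator, but all of them still point at the same final directory. The patch therefore groups their FileSinkDesc objects per destination path (linkedFileSinks), and when a merge is planned, createMoveTask redirects the whole group to one temporary directory so a single merge work and one move task can publish the result. The sketch below restates only that grouping-and-redirect pattern outside of Hive; SinkDesc, LinkedSinkSketch, linkByDestination, redirectToTmp and the paths are hypothetical stand-ins for illustration, not Hive APIs.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Hypothetical stand-in for Hive's FileSinkDesc: it only tracks where the sink
// currently writes, the destination the query asked for, and its linked group.
class SinkDesc {
  String dirName;               // where this sink currently writes
  final String finalDir;        // the destination requested by the query
  List<SinkDesc> linkedSinks;   // all sinks sharing finalDir

  SinkDesc(String finalDir) {
    this.finalDir = finalDir;
    this.dirName = finalDir;
  }
}

public class LinkedSinkSketch {

  // Group sinks by their final destination, mirroring how the patch fills
  // GenSparkProcContext.linkedFileSinks while walking the operator tree.
  static Map<String, List<SinkDesc>> linkByDestination(List<SinkDesc> sinks) {
    Map<String, List<SinkDesc>> linked = new LinkedHashMap<>();
    for (SinkDesc desc : sinks) {
      List<SinkDesc> group = linked.computeIfAbsent(desc.finalDir, k -> new ArrayList<>());
      group.add(desc);
      desc.linkedSinks = group;  // every member can reach its whole group
    }
    return linked;
  }

  // When a merge/move is planned, point every linked sink at one shared tmp dir,
  // analogous to the loop over getLinkedFileSinkDesc() in createMoveTask above.
  static void redirectToTmp(SinkDesc anySink, String tmpDir) {
    for (SinkDesc desc : anySink.linkedSinks) {
      desc.dirName = tmpDir;
    }
  }

  public static void main(String[] args) {
    SinkDesc a = new SinkDesc("/warehouse/union_merge.out");
    SinkDesc b = new SinkDesc("/warehouse/union_merge.out");  // the other union branch
    Map<String, List<SinkDesc>> linked = linkByDestination(Arrays.asList(a, b));

    redirectToTmp(a, "/tmp/-ext-10002");
    System.out.println(linked.keySet());               // one destination, two linked sinks
    System.out.println(a.dirName + " " + b.dirName);   // both now write to the shared tmp dir
  }
}

Once both branches write under one temporary directory, a single merge job can compact the files and one move task publishes them to the final location. That is what the second EXPLAIN above reflects when hive.merge.sparkfiles=true: the conditional Stage-6 chooses between a plain move (Stage-3) and the "Spark Merge File Work" stages (Stage-2/Stage-4 followed by Stage-5), and the dfs -ls at the end of the test shows the output collapsing from 2 files to 1.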