diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties
index 42bc5df0a9..784cd1764a 100644
--- itests/src/test/resources/testconfiguration.properties
+++ itests/src/test/resources/testconfiguration.properties
@@ -448,6 +448,7 @@ minillap.query.files=acid_bucket_pruning.q,\
   temp_table_drop_partitions_filter4.q
 
 minillaplocal.query.files=\
+  empty_files_external_table.q,\
   bucket_num_reducers_acid.q,\
   dec_str.q,\
   dp_counter_non_mm.q,\
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
index d0f452b0c7..1bb52b0348 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FileSinkOperator.java
@@ -1336,7 +1336,8 @@ public void closeOp(boolean abort) throws HiveException {
         Class clazz = conf.getTableInfo().getOutputFileFormatClass();
         boolean isStreaming = StreamingOutputFormat.class.isAssignableFrom(clazz);
 
-        if (!isTez || isStreaming || this.isInsertOverwrite) {
+        // Keep generating empty files for MM/ACID tables as a quick and dirty workaround for HIVE-22941
+        if (!isTez || isStreaming || (this.isInsertOverwrite && (conf.isMmTable() || conf.isFullAcidTable()))) {
           createBucketFiles(fsp);
         }
       }
diff --git ql/src/test/queries/clientpositive/empty_files_external_table.q ql/src/test/queries/clientpositive/empty_files_external_table.q
new file mode 100644
index 0000000000..873789540c
--- /dev/null
+++ ql/src/test/queries/clientpositive/empty_files_external_table.q
@@ -0,0 +1,10 @@
+create external table t1(age int, name string, id int) stored as orc;
+create external table t6 like t1;
+create external table t5 like t1;
+
+describe formatted t1;
+explain insert overwrite table t1 select a.* from t5 a full outer join t6 b on a.id=b.id and a.name=b.name and a.age=b.age;
+
+-- the insert below must not put an empty file into the external table; it should only clear its contents
+insert overwrite table t1 select a.* from t5 a full outer join t6 b on a.id=b.id and a.name=b.name and a.age=b.age;
+dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/t1/;
\ No newline at end of file
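[Reviewer note] The FileSinkOperator hunk above is the whole fix: on Tez, closeOp() now materializes empty bucket files on INSERT OVERWRITE only when the target is an MM or full-ACID table. Below is a minimal sketch of that decision with the predicate pulled out into a standalone method; the class, method, and parameter names are illustrative only, not actual FileSinkOperator members:

    // Sketch of the HIVE-22941 workaround condition; not the real Hive code.
    public class EmptyFileDecisionSketch {

        static boolean shouldCreateEmptyFiles(boolean isTez, boolean isStreaming,
                boolean isInsertOverwrite, boolean isMmTable, boolean isFullAcidTable) {
            // Non-Tez execution and streaming output formats keep the old behavior
            // and always create the (possibly empty) bucket files.
            if (!isTez || isStreaming) {
                return true;
            }
            // On Tez, INSERT OVERWRITE now creates empty files only for
            // transactional (MM or full-ACID) targets; a plain external table
            // is simply cleared, with no empty ORC file left behind.
            return isInsertOverwrite && (isMmTable || isFullAcidTable);
        }

        public static void main(String[] args) {
            // INSERT OVERWRITE into a non-transactional external table on Tez:
            System.out.println(shouldCreateEmptyFiles(true, false, true, false, false)); // false
            // The same statement against a full-ACID table still creates the files:
            System.out.println(shouldCreateEmptyFiles(true, false, true, false, true));  // true
        }
    }

The empty_files_external_table.q test above drives the first case in main(): both join inputs are empty, so the overwrite produces zero rows and, with the fix, no file at all. Its expected golden output follows.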
diff --git ql/src/test/results/clientpositive/llap/empty_files_external_table.q.out ql/src/test/results/clientpositive/llap/empty_files_external_table.q.out
new file mode 100644
index 0000000000..1491203caf
--- /dev/null
+++ ql/src/test/results/clientpositive/llap/empty_files_external_table.q.out
@@ -0,0 +1,204 @@
+PREHOOK: query: create external table t1(age int, name string, id int) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1
+POSTHOOK: query: create external table t1(age int, name string, id int) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1
+PREHOOK: query: create external table t6 like t1
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t6
+POSTHOOK: query: create external table t6 like t1
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t6
+PREHOOK: query: create external table t5 like t1
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t5
+POSTHOOK: query: create external table t5 like t1
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t5
+PREHOOK: query: describe formatted t1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@t1
+POSTHOOK: query: describe formatted t1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@t1
+# col_name            data_type           comment
+age                   int
+name                  string
+id                    int
+
+# Detailed Table Information
+Database:             default
+#### A masked pattern was here ####
+Retention:            0
+#### A masked pattern was here ####
+Table Type:           EXTERNAL_TABLE
+Table Parameters:
+        COLUMN_STATS_ACCURATE    {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"age\":\"true\",\"id\":\"true\",\"name\":\"true\"}}
+        EXTERNAL                 TRUE
+        bucketing_version        2
+        numFiles                 0
+        numRows                  0
+        rawDataSize              0
+        totalSize                0
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:        org.apache.hadoop.hive.ql.io.orc.OrcSerde
+InputFormat:          org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+OutputFormat:         org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+Compressed:           No
+Num Buckets:          -1
+Bucket Columns:       []
+Sort Columns:         []
+Storage Desc Params:
+        serialization.format    1
+PREHOOK: query: explain insert overwrite table t1 select a.* from t5 a full outer join t6 b on a.id=b.id and a.name=b.name and a.age=b.age
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t5
+PREHOOK: Input: default@t6
+PREHOOK: Output: default@t1
+POSTHOOK: query: explain insert overwrite table t1 select a.* from t5 a full outer join t6 b on a.id=b.id and a.name=b.name and a.age=b.age
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t5
+POSTHOOK: Input: default@t6
+POSTHOOK: Output: default@t1
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+  Stage-3 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: a
+                  Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: age (type: int), name (type: string), id (type: int)
+                    outputColumnNames: _col0, _col1, _col2
+                    Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: int), _col1 (type: string), _col2 (type: int)
+                      null sort order: zzz
+                      sort order: +++
+                      Map-reduce partition columns: _col0 (type: int), _col1 (type: string), _col2 (type: int)
+                      Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Map 4
+            Map Operator Tree:
+                TableScan
+                  alias: b
+                  Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: age (type: int), name (type: string), id (type: int)
+                    outputColumnNames: _col0, _col1, _col2
+                    Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: int), _col1 (type: string), _col2 (type: int)
+                      null sort order: zzz
+                      sort order: +++
+                      Map-reduce partition columns: _col0 (type: int), _col1 (type: string), _col2 (type: int)
+                      Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Reducer 2
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Full Outer Join 0 to 1
+                keys:
+                  0 _col0 (type: int), _col1 (type: string), _col2 (type: int)
+                  1 _col0 (type: int), _col1 (type: string), _col2 (type: int)
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 211 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 211 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                      serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                      name: default.t1
+                Select Operator
+                  expressions: _col0 (type: int), _col1 (type: string), _col2 (type: int)
+                  outputColumnNames: age, name, id
+                  Statistics: Num rows: 1 Data size: 211 Basic stats: COMPLETE Column stats: NONE
+                  Group By Operator
+                    aggregations: compute_stats(age, 'hll'), compute_stats(name, 'hll'), compute_stats(id, 'hll')
+                    minReductionHashAggr: 0.99
+                    mode: hash
+                    outputColumnNames: _col0, _col1, _col2
+                    Statistics: Num rows: 1 Data size: 1288 Basic stats: COMPLETE Column stats: NONE
+                    Reduce Output Operator
+                      null sort order:
+                      sort order:
+                      Statistics: Num rows: 1 Data size: 1288 Basic stats: COMPLETE Column stats: NONE
+                      value expressions: _col0 (type: struct), _col1 (type: struct), _col2 (type: struct)
+        Reducer 3
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 1320 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-2
+    Dependency Collection
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: true
+          table:
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.t1
+
+  Stage: Stage-3
+    Stats Work
+      Basic Stats Work:
+      Column Stats Desc:
+          Columns: age, name, id
+          Column Types: int, string, int
+          Table: default.t1
+
+PREHOOK: query: insert overwrite table t1 select a.* from t5 a full outer join t6 b on a.id=b.id and a.name=b.name and a.age=b.age
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t5
+PREHOOK: Input: default@t6
+PREHOOK: Output: default@t1
+POSTHOOK: query: insert overwrite table t1 select a.* from t5 a full outer join t6 b on a.id=b.id and a.name=b.name and a.age=b.age
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t5
+POSTHOOK: Input: default@t6
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.age SIMPLE [(t5)a.FieldSchema(name:age, type:int, comment:null), ]
+POSTHOOK: Lineage: t1.id SIMPLE [(t5)a.FieldSchema(name:id, type:int, comment:null), ]
+POSTHOOK: Lineage: t1.name SIMPLE [(t5)a.FieldSchema(name:name, type:string, comment:null), ]
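[Reviewer note] MM and full-ACID tables are deliberately carved out of the new condition, presumably because their INSERT OVERWRITE path still depends on the empty files being written even for an empty result set; the inline comment itself flags this as a quick and dirty workaround rather than a final fix for HIVE-22941. The trailing dfs -ls in the .q file is the observable check that no empty ORC file remains under t1 after the overwrite.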