diff --git itests/qtest/testconfiguration.properties itests/qtest/testconfiguration.properties index 385397d..cf80376 100644 --- itests/qtest/testconfiguration.properties +++ itests/qtest/testconfiguration.properties @@ -1,5 +1,5 @@ minimr.query.files=stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,bucket6.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q,index_bitmap3.q,ql_rewrite_gbtoidx.q,index_bitmap_auto.q,udf_using.q,empty_dir_in_table.q,temp_table_external.q minimr.query.negative.files=cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q,file_with_header_footer_negative.q,udf_local_resource.q minitez.query.files=tez_fsstat.q,mapjoin_decimal.q,tez_join_tests.q,tez_joins_explain.q,mrr.q,tez_dml.q,tez_insert_overwrite_local_directory_1.q,tez_union.q,bucket_map_join_tez1.q,bucket_map_join_tez2.q,tez_schema_evolution.q,tez_join_hash.q -minitez.query.files.shared=orc_merge1.q,orc_merge2.q,orc_merge3.q,orc_merge4.q,alter_merge_orc.q,alter_merge_2_orc.q,alter_merge_stats_orc.q,cross_product_check_1.q,cross_product_check_2.q,dynpart_sort_opt_vectorization.q,dynpart_sort_optimization.q,orc_analyze.q,join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q,stats_counter.q,stats_noscan_1.q,stats_counter_partitioned.q,union2.q,union3.q,union4.q,union5.q,union6.q,union7.q,union8.q,union9.q,transform1.q,transform2.q,transform_ppr1.q,transform_ppr2.q,script_env_var1.q,script_env_var2.q,script_pipe.q,scriptfile1.q,metadataonly1.q,temp_table.q,vectorized_ptf.q,optimize_nullscan.q,vector_cast_constant.q,vector_string_concat.q,vector_decimal_aggregate.q,vector_left_outer_join.q,vectorization_12.q,vectorization_13.q,vectorization_14.q,vectorization_9.q,vectorization_part_project.q,vectorization_short_regress.q,vectorized_mapjoin.q,vectorized_nested_mapjoin.q,vectorized_shufflejoin.q,vectorized_timestamp_funcs.q,vector_data_types.q 
+minitez.query.files.shared=orc_merge1.q,orc_merge2.q,orc_merge3.q,orc_merge4.q,orc_merge5.q,orc_merge6.q,orc_merge7.q,alter_merge_orc.q,alter_merge_2_orc.q,alter_merge_stats_orc.q,cross_product_check_1.q,cross_product_check_2.q,dynpart_sort_opt_vectorization.q,dynpart_sort_optimization.q,orc_analyze.q,join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q,stats_counter.q,stats_noscan_1.q,stats_counter_partitioned.q,union2.q,union3.q,union4.q,union5.q,union6.q,union7.q,union8.q,union9.q,transform1.q,transform2.q,transform_ppr1.q,transform_ppr2.q,script_env_var1.q,script_env_var2.q,script_pipe.q,scriptfile1.q,metadataonly1.q,temp_table.q,vectorized_ptf.q,optimize_nullscan.q,vector_cast_constant.q,vector_string_concat.q,vector_decimal_aggregate.q,vector_left_outer_join.q,vectorization_12.q,vectorization_13.q,vectorization_14.q,vectorization_9.q,vectorization_part_project.q,vectorization_short_regress.q,vectorized_mapjoin.q,vectorized_nested_mapjoin.q,vectorized_shufflejoin.q,vectorized_timestamp_funcs.q,vector_data_types.q beeline.positive.exclude=add_part_exist.q,alter1.q,alter2.q,alter4.q,alter5.q,alter_rename_partition.q,alter_rename_partition_authorization.q,archive.q,archive_corrupt.q,archive_multi.q,archive_mr_1806.q,archive_multi_mr_1806.q,authorization_1.q,authorization_2.q,authorization_4.q,authorization_5.q,authorization_6.q,authorization_7.q,ba_table1.q,ba_table2.q,ba_table3.q,ba_table_udfs.q,binary_table_bincolserde.q,binary_table_colserde.q,cluster.q,columnarserde_create_shortcut.q,combine2.q,constant_prop.q,create_nested_type.q,create_or_replace_view.q,create_struct_table.q,create_union_table.q,database.q,database_location.q,database_properties.q,ddltime.q,describe_database_json.q,drop_database_removes_partition_dirs.q,escape1.q,escape2.q,exim_00_nonpart_empty.q,exim_01_nonpart.q,exim_02_00_part_empty.q,exim_02_part.q,exim_03_nonpart_over_compat.q,exim_04_all_part.q,exim_04_evolved_parts.q,exim_05_some_part.q,exim_06_one_part.q,exim_07_all_part_over_nonoverlap.q,exim_08_nonpart_rename.q,exim_09_part_spec_nonoverlap.q,exim_10_external_managed.q,exim_11_managed_external.q,exim_12_external_location.q,exim_13_managed_location.q,exim_14_managed_location_over_existing.q,exim_15_external_part.q,exim_16_part_external.q,exim_17_part_managed.q,exim_18_part_external.q,exim_19_00_part_external_location.q,exim_19_part_external_location.q,exim_20_part_managed_location.q,exim_21_export_authsuccess.q,exim_22_import_exist_authsuccess.q,exim_23_import_part_authsuccess.q,exim_24_import_nonexist_authsuccess.q,global_limit.q,groupby_complex_types.q,groupby_complex_types_multi_single_reducer.q,index_auth.q,index_auto.q,index_auto_empty.q,index_bitmap.q,index_bitmap1.q,index_bitmap2.q,index_bitmap3.q,index_bitmap_auto.q,index_bitmap_rc.q,index_compact.q,index_compact_1.q,index_compact_2.q,index_compact_3.q,index_stale_partitioned.q,init_file.q,input16.q,input16_cc.q,input46.q,input_columnarserde.q,input_dynamicserde.q,input_lazyserde.q,input_testxpath3.q,input_testxpath4.q,insert2_overwrite_partiti
ons.q,insertexternal1.q,join_thrift.q,lateral_view.q,load_binary_data.q,load_exist_part_authsuccess.q,load_nonpart_authsuccess.q,load_part_authsuccess.q,loadpart_err.q,lock1.q,lock2.q,lock3.q,lock4.q,merge_dynamic_partition.q,multi_insert.q,multi_insert_move_tasks_share_dependencies.q,null_column.q,ppd_clusterby.q,query_with_semi.q,rename_column.q,sample6.q,sample_islocalmode_hook.q,set_processor_namespaces.q,show_tables.q,source.q,split_sample.q,str_to_map.q,transform1.q,udaf_collect_set.q,udaf_context_ngrams.q,udaf_histogram_numeric.q,udaf_ngrams.q,udaf_percentile_approx.q,udf_array.q,udf_bitmap_and.q,udf_bitmap_or.q,udf_explode.q,udf_format_number.q,udf_map.q,udf_map_keys.q,udf_map_values.q,udf_max.q,udf_min.q,udf_named_struct.q,udf_percentile.q,udf_printf.q,udf_sentences.q,udf_sort_array.q,udf_split.q,udf_struct.q,udf_substr.q,udf_translate.q,udf_union.q,udf_xpath.q,udtf_stack.q,view.q,virtual_column.q diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java index cd017d8..07d7de5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/DDLTask.java @@ -88,8 +88,8 @@ import org.apache.hadoop.hive.ql.exec.ArchiveUtils.PartSpecInfo; import org.apache.hadoop.hive.ql.hooks.ReadEntity; import org.apache.hadoop.hive.ql.hooks.WriteEntity; -import org.apache.hadoop.hive.ql.io.merge.MergeTask; -import org.apache.hadoop.hive.ql.io.merge.MergeWork; +import org.apache.hadoop.hive.ql.io.merge.MergeFileTask; +import org.apache.hadoop.hive.ql.io.merge.MergeFileWork; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateTask; import org.apache.hadoop.hive.ql.io.rcfile.truncate.ColumnTruncateWork; @@ -549,14 +549,14 @@ private DataOutputStream getOutputStream(Path outputFile) throws Exception { private int mergeFiles(Hive db, AlterTablePartMergeFilesDesc mergeFilesDesc) throws HiveException { // merge work only needs input and output. 
- MergeWork mergeWork = new MergeWork(mergeFilesDesc.getInputDir(), - mergeFilesDesc.getOutputDir(), mergeFilesDesc.getInputFormatClass()); + MergeFileWork mergeWork = new MergeFileWork(mergeFilesDesc.getInputDir(), + mergeFilesDesc.getOutputDir(), mergeFilesDesc.getInputFormatClass().getName()); mergeWork.setListBucketingCtx(mergeFilesDesc.getLbCtx()); mergeWork.resolveConcatenateMerge(db.getConf()); mergeWork.setMapperCannotSpanPartns(true); - mergeWork.setSourceTableInputFormat(mergeFilesDesc.getInputFormatClass()); + mergeWork.setSourceTableInputFormat(mergeFilesDesc.getInputFormatClass().getName()); DriverContext driverCxt = new DriverContext(); - MergeTask taskExec = new MergeTask(); + MergeFileTask taskExec = new MergeFileTask(); taskExec.initialize(db.getConf(), null, driverCxt); taskExec.setWork(mergeWork); taskExec.setQueryPlan(this.getQueryPlan()); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java index d5de58e..64112b2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MapOperator.java @@ -472,11 +472,16 @@ public void cleanUpInputFileChangedOp() throws HiveException { PartitionDesc partDesc = conf.getPathToPartitionInfo().get(onefile); for (String onealias : conf.getPathToAliases().get(onefile)) { Operator op = conf.getAliasToWork().get(onealias); - MapInputPath inp = new MapInputPath(onefile, onealias, op, partDesc); - MapOpCtx context = opCtxMap.get(inp); - if (context != null) { - current = context; - LOG.info("Processing alias " + onealias + " for file " + onefile); + if (op != null) { + MapInputPath inp = new MapInputPath(onefile, onealias, op, partDesc); + MapOpCtx context = opCtxMap.get(inp); + if (context != null) { + current = context; + LOG.info("Processing alias " + onealias + " for file " + onefile); + return; + } + } else { + // could be MergeFileWork which might not have operator tree return; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java index a2975cb..5b04880 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/MoveTask.java @@ -36,7 +36,7 @@ import org.apache.hadoop.hive.ql.hooks.LineageInfo.DataContainer; import org.apache.hadoop.hive.ql.hooks.WriteEntity; import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; -import org.apache.hadoop.hive.ql.io.merge.MergeTask; +import org.apache.hadoop.hive.ql.io.merge.MergeFileTask; import org.apache.hadoop.hive.ql.lockmgr.HiveLock; import org.apache.hadoop.hive.ql.lockmgr.HiveLockManager; import org.apache.hadoop.hive.ql.lockmgr.HiveLockObj; @@ -294,7 +294,7 @@ public int execute(DriverContext driverContext) { while (task.getParentTasks() != null && task.getParentTasks().size() == 1) { task = (Task)task.getParentTasks().get(0); // If it was a merge task or a local map reduce task, nothing can be inferred - if (task instanceof MergeTask || task instanceof MapredLocalTask) { + if (task instanceof MergeFileTask || task instanceof MapredLocalTask) { break; } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java index 24dfed1..5e148c7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TaskFactory.java @@ -28,8 +28,8 @@ import org.apache.hadoop.hive.ql.exec.tez.TezTask; import 
org.apache.hadoop.hive.ql.index.IndexMetadataChangeTask; import org.apache.hadoop.hive.ql.index.IndexMetadataChangeWork; -import org.apache.hadoop.hive.ql.io.merge.MergeTask; -import org.apache.hadoop.hive.ql.io.merge.MergeWork; +import org.apache.hadoop.hive.ql.io.merge.MergeFileTask; +import org.apache.hadoop.hive.ql.io.merge.MergeFileWork; import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanTask; import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanWork; import org.apache.hadoop.hive.ql.plan.ColumnStatsWork; @@ -92,8 +92,8 @@ public TaskTuple(Class workClass, Class> taskClass) { StatsTask.class)); taskvec.add(new TaskTuple(StatsNoJobWork.class, StatsNoJobTask.class)); taskvec.add(new TaskTuple(ColumnStatsWork.class, ColumnStatsTask.class)); - taskvec.add(new TaskTuple(MergeWork.class, - MergeTask.class)); + taskvec.add(new TaskTuple(MergeFileWork.class, + MergeFileTask.class)); taskvec.add(new TaskTuple(DependencyCollectionWork.class, DependencyCollectionTask.class)); taskvec.add(new TaskTuple(PartialScanWork.class, diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 1d6a93a..42e240b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -121,7 +121,7 @@ import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat; import org.apache.hadoop.hive.ql.io.RCFile; import org.apache.hadoop.hive.ql.io.ReworkMapredInputFormat; -import org.apache.hadoop.hive.ql.io.merge.MergeWork; +import org.apache.hadoop.hive.ql.io.merge.MergeFileWork; import org.apache.hadoop.hive.ql.io.orc.OrcFileMergeMapper; import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileMergeMapper; import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanMapper; @@ -350,7 +350,7 @@ private static BaseWork getBaseWork(Configuration conf, String name) { gWork = deserializePlan(in, MapWork.class, conf); } else if(RCFileMergeMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS)) || OrcFileMergeMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) { - gWork = deserializePlan(in, MergeWork.class, conf); + gWork = deserializePlan(in, MergeFileWork.class, conf); } else if(ColumnTruncateMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) { gWork = deserializePlan(in, ColumnTruncateWork.class, conf); } else if(PartialScanMapper.class.getName().equals(conf.get(MAPRED_MAPPER_CLASS))) { @@ -3293,19 +3293,20 @@ private static void createTmpDirs(Configuration conf, FsPermission fsPermission = new FsPermission((short)00777); while (!ops.isEmpty()) { Operator op = ops.remove(0); - - if (op instanceof FileSinkOperator) { - FileSinkDesc fdesc = ((FileSinkOperator) op).getConf(); - Path tempDir = fdesc.getDirName(); - - if (tempDir != null) { - Path tempPath = Utilities.toTempPath(tempDir); - createDirsWithPermission(conf, tempPath, fsPermission); + if (op != null) { + if (op instanceof FileSinkOperator) { + FileSinkDesc fdesc = ((FileSinkOperator) op).getConf(); + Path tempDir = fdesc.getDirName(); + + if (tempDir != null) { + Path tempPath = Utilities.toTempPath(tempDir); + createDirsWithPermission(conf, tempPath, fsPermission); + } } - } - if (op.getChildOperators() != null) { - ops.addAll(op.getChildOperators()); + if (op.getChildOperators() != null) { + ops.addAll(op.getChildOperators()); + } } } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java index 
4e0fd79..2134a70 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecMapper.java @@ -18,14 +18,6 @@ package org.apache.hadoop.hive.ql.exec.mr; -import java.io.IOException; -import java.lang.management.ManagementFactory; -import java.lang.management.MemoryMXBean; -import java.net.URLClassLoader; -import java.util.Arrays; -import java.util.List; -import java.util.Map; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.ql.exec.FetchOperator; @@ -37,6 +29,7 @@ import org.apache.hadoop.hive.ql.exec.OperatorUtils; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator; +import org.apache.hadoop.hive.ql.io.merge.MergeFileWork; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.MapredLocalWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; @@ -48,6 +41,14 @@ import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.util.StringUtils; +import java.io.IOException; +import java.lang.management.ManagementFactory; +import java.lang.management.MemoryMXBean; +import java.net.URLClassLoader; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + /** * ExecMapper is the generic Map class for Hive. Together with ExecReducer it is * the bridge between the map-reduce framework and the Hive operator pipeline at @@ -119,8 +120,11 @@ public void configure(JobConf job) { mo = new MapOperator(); } mo.setConf(mrwork); - // initialize map operator - mo.setChildren(job); + // merge file task might not have operators + if (!(mrwork instanceof MergeFileWork)) { + // initialize map operator + mo.setChildren(job); + } l4j.info(mo.dump(0)); // initialize map local work localWork = mrwork.getMapLocalWork(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java index e116426..931ede6 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java @@ -17,21 +17,10 @@ */ package org.apache.hadoop.hive.ql.exec.tez; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.net.URI; -import java.net.URISyntaxException; -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import javax.security.auth.login.LoginException; - +import com.google.common.base.Function; +import com.google.common.collect.Iterators; +import com.google.common.collect.Lists; +import com.google.protobuf.ByteString; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; @@ -47,12 +36,17 @@ import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.mr.ExecMapper; import org.apache.hadoop.hive.ql.exec.mr.ExecReducer; +import org.apache.hadoop.hive.ql.exec.tez.tools.RCFileMergeFileTezProcessor; import org.apache.hadoop.hive.ql.exec.tez.tools.TezMergedLogicalInput; import org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; import org.apache.hadoop.hive.ql.io.HiveInputFormat; import org.apache.hadoop.hive.ql.io.HiveKey; import org.apache.hadoop.hive.ql.io.HiveOutputFormatImpl; +import 
org.apache.hadoop.hive.ql.io.RCFileInputFormat; +import org.apache.hadoop.hive.ql.io.merge.MergeFileOutputFormat; +import org.apache.hadoop.hive.ql.io.merge.MergeFileWork; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.BaseWork; import org.apache.hadoop.hive.ql.plan.MapWork; @@ -67,6 +61,7 @@ import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputFormat; @@ -95,7 +90,6 @@ import org.apache.tez.dag.api.TezException; import org.apache.tez.dag.api.Vertex; import org.apache.tez.dag.api.VertexGroup; -import org.apache.tez.dag.api.VertexLocationHint; import org.apache.tez.dag.api.VertexManagerPluginDescriptor; import org.apache.tez.dag.library.vertexmanager.ShuffleVertexManager; import org.apache.tez.mapreduce.hadoop.InputSplitInfo; @@ -112,10 +106,19 @@ import org.apache.tez.runtime.library.output.OnFileUnorderedKVOutput; import org.apache.tez.runtime.library.output.OnFileUnorderedPartitionedKVOutput; -import com.google.common.base.Function; -import com.google.common.collect.Iterators; -import com.google.common.collect.Lists; -import com.google.protobuf.ByteString; +import javax.security.auth.login.LoginException; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; /** * DagUtils. DagUtils is a collection of helper methods to convert @@ -200,8 +203,15 @@ private JobConf initializeVertexConf(JobConf baseConf, MapWork mapWork) { inpFormat = CombineHiveInputFormat.class.getName(); } - conf.set("mapred.mapper.class", ExecMapper.class.getName()); - conf.set("mapred.input.format.class", inpFormat); + if (mapWork instanceof MergeFileWork) { + MergeFileWork mfWork = (MergeFileWork) mapWork; + conf.set("mapred.mapper.class", + mfWork.getMapperClass(mfWork.getSourceTableInputFormat()).getName()); + conf.set("mapred.input.format.class", mfWork.getInputformat()); + } else { + conf.set("mapred.mapper.class", ExecMapper.class.getName()); + conf.set("mapred.input.format.class", inpFormat); + } return conf; } @@ -465,6 +475,25 @@ private Vertex createVertex(JobConf conf, MapWork mapWork, } } + if (mapWork instanceof MergeFileWork) { + conf.setClass("mapred.output.format.class", MergeFileOutputFormat.class, + FileOutputFormat.class); + Path outputPath = ((MergeFileWork) mapWork).getOutputDir(); + // prepare the tmp output directory. 
The output tmp directory should + // exist before to renaming after job completion + Path tempOutPath = Utilities.toTempPath(outputPath); + try { + if (!fs.exists(tempOutPath)) { + fs.mkdirs(tempOutPath); + } + } catch (IOException e) { + throw new RuntimeException( + "Can't make path " + outputPath + " : " + e.getMessage()); + } + + MergeFileOutputFormat.setMergeOutputPath(conf, outputPath); + } + if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION) && !mapWork.isUseOneNullRowInputFormat()) { // if we're generating the splits in the AM, we just need to set @@ -480,18 +509,62 @@ private Vertex createVertex(JobConf conf, MapWork mapWork, // set up the operator plan Utilities.setMapWork(conf, mapWork, mrScratchDir, false); - byte[] serializedConf = MRHelpers.createUserPayloadFromConf(conf); - map = new Vertex(mapWork.getName(), - new ProcessorDescriptor(MapTezProcessor.class.getName()). - setUserPayload(serializedConf), numTasks, getContainerResource(conf)); + final byte[] serializedConf; + if (mapWork instanceof MergeFileWork) { + MergeFileWork mfWork = (MergeFileWork) mapWork; + conf.setBoolean( + HiveConf.ConfVars.HIVEMERGECURRENTJOBHASDYNAMICPARTITIONS.name(), + mfWork.hasDynamicPartitions()); + conf.setBoolean( + HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETING.name(), + mfWork.isListBucketingAlterTableConcatenate()); + conf.setInt( + HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETINGDEPTH + .name(), + ((mfWork.getListBucketingCtx() == null) ? 0 : + mfWork.getListBucketingCtx() + .calculateListBucketingLevel())); + serializedConf = MRHelpers.createUserPayloadFromConf(conf); + if (mfWork.getSourceTableInputFormat().equals( + OrcInputFormat.class.getName())) { + map = new Vertex(mapWork.getName(), + new ProcessorDescriptor(OrcMergeFileTezProcessor.class.getName()). + setUserPayload(serializedConf), numTasks, + getContainerResource(conf)); + } else if (mfWork.getSourceTableInputFormat().equals( + RCFileInputFormat.class.getName())) { + map = new Vertex(mapWork.getName(), + new ProcessorDescriptor(RCFileMergeFileTezProcessor.class.getName()) + . + setUserPayload(serializedConf), numTasks, + getContainerResource(conf)); + } else { + throw new RuntimeException("Unsupported merge file input format " + + mapWork.getInputformat().getClass().getName()); + } + } else { + serializedConf = MRHelpers.createUserPayloadFromConf(conf); + map = new Vertex(mapWork.getName(), + new ProcessorDescriptor(MapTezProcessor.class.getName()). 
+ setUserPayload(serializedConf), numTasks, + getContainerResource(conf)); + } + Map environment = new HashMap(); MRHelpers.updateEnvironmentForMRTasks(conf, environment, true); map.setTaskEnvironment(environment); map.setJavaOpts(getContainerJavaOpts(conf)); - assert mapWork.getAliasToWork().keySet().size() == 1; + if (!(mapWork instanceof MergeFileWork)) { + assert mapWork.getAliasToWork().keySet().size() == 1; + } - String alias = mapWork.getAliasToWork().keySet().iterator().next(); + final String alias; + if (!mapWork.getAliasToWork().keySet().isEmpty()) { + alias = mapWork.getAliasToWork().keySet().iterator().next(); + } else { + alias = mapWork.getPathToAliases().keySet().iterator().next(); + } byte[] mrInput = null; if (useTezGroupedSplits) { @@ -782,7 +855,7 @@ public String getBaseName(LocalResource lr) { } /** - * @param pathStr - the string from which we try to determine the resource base name + * @param path - the string from which we try to determine the resource base name * @return the name of the resource from a given path string. */ public String getResourceBaseName(Path path) { @@ -917,7 +990,6 @@ public JobConf initializeVertexConf(JobConf conf, BaseWork work) { * @param work The instance of BaseWork representing the actual work to be performed * by this vertex. * @param scratchDir HDFS scratch dir for this execution unit. - * @param list * @param appJarLr Local resource for hive-exec. * @param additionalLr * @param fileSystem FS corresponding to scratchDir and LocalResources diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java index 8513e33..7e1e7d3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java @@ -17,11 +17,6 @@ */ package org.apache.hadoop.hive.ql.exec.tez; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -37,6 +32,7 @@ import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; import org.apache.hadoop.hive.ql.exec.tez.TezProcessor.TezKVOutputCollector; import org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator; +import org.apache.hadoop.hive.ql.io.merge.MergeFileWork; import org.apache.hadoop.hive.ql.log.PerfLogger; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; @@ -50,6 +46,11 @@ import org.apache.tez.runtime.api.TezProcessorContext; import org.apache.tez.runtime.library.api.KeyValueReader; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + /** * Process input from tez LogicalInput and write output - for a map plan * Just pump the records through the query plan. 
@@ -111,7 +112,10 @@ void init(JobConf jconf, TezProcessorContext processorContext, MRTaskReporter mr // initialize map operator mapOp.setConf(mapWork); - mapOp.setChildren(jconf); + // MergeFileWork might not have operator tree + if (!(mapWork instanceof MergeFileWork)) { + mapOp.setChildren(jconf); + } l4j.info(mapOp.dump(0)); MapredContext.init(true, new JobConf(jconf)); @@ -199,7 +203,7 @@ private boolean processRow(Object value) { } @Override - void close(){ + void close() throws IOException { // check if there are IOExceptions if (!abort) { abort = execContext.getIoCxt().getIOExceptions(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapTezProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapTezProcessor.java index 31f3bcd..eb21463 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapTezProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapTezProcessor.java @@ -21,7 +21,12 @@ * Subclass that is used to indicate if this is a map or reduce process */ public class MapTezProcessor extends TezProcessor { + public MapTezProcessor(){ super(true); } + + public MapTezProcessor(MergeFileFormat fileFormat){ + super(true, fileFormat); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileMapRecordProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileMapRecordProcessor.java new file mode 100644 index 0000000..eed5db3 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileMapRecordProcessor.java @@ -0,0 +1,232 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.tez; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.merge.MergeFileOutputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.tez.mapreduce.processor.MRTaskReporter; +import org.apache.tez.runtime.api.LogicalInput; +import org.apache.tez.runtime.api.LogicalOutput; +import org.apache.tez.runtime.api.TezProcessorContext; + +import java.io.IOException; +import java.util.Map; + +/** + * Map record processor for fast merging of files. 
+ */ +public class MergeFileMapRecordProcessor extends MapRecordProcessor { + + public static final Log LOG = LogFactory + .getLog(MergeFileMapRecordProcessor.class); + protected JobConf jc; + protected FileSystem fs; + protected boolean autoDelete; + protected boolean exception; + protected Path outPath; + protected Path finalPath; + protected Path dpPath; + protected Path tmpPath; + protected Path taskTmpPath; + protected int listBucketingDepth; + protected boolean hasDynamicPartitions; + protected boolean isListBucketingAlterTableConcatenate; + protected boolean tmpPathFixedConcatenate; + protected boolean tmpPathFixed; + + @Override + void init(JobConf jconf, TezProcessorContext processorContext, + MRTaskReporter mrReporter, Map inputs, + Map outputs) throws Exception { + super.init(jconf, processorContext, mrReporter, inputs, outputs); + this.jc = jconf; + hasDynamicPartitions = jconf.getBoolean( + HiveConf.ConfVars.HIVEMERGECURRENTJOBHASDYNAMICPARTITIONS.name(), + HiveConf.ConfVars.HIVEMERGECURRENTJOBHASDYNAMICPARTITIONS.defaultBoolVal); + isListBucketingAlterTableConcatenate = jconf.getBoolean( + HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETING.name(), + HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETING.defaultBoolVal); + listBucketingDepth = jconf.getInt( + HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETINGDEPTH + .name(), + HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETINGDEPTH.defaultIntVal); + Path specPath = MergeFileOutputFormat.getMergeOutputPath(jconf); + Path tmpPath = Utilities.toTempPath(specPath); + Path taskTmpPath = Utilities.toTaskTempPath(specPath); + updatePaths(tmpPath, taskTmpPath); + try { + fs = specPath.getFileSystem(jconf); + autoDelete = fs.deleteOnExit(outPath); + } catch (IOException e) { + this.exception = true; + throw new RuntimeException(e); + } + } + + @Override + void close() throws IOException { + if (!exception) { + FileStatus fss = fs.getFileStatus(outPath); + if (!fs.rename(outPath, finalPath)) { + throw new IOException( + "Unable to rename " + outPath + " to " + finalPath); + } + LOG.info("renamed path " + outPath + " to " + finalPath + + " . File size is " + fss.getLen()); + } else { + if (!autoDelete) { + fs.delete(outPath, true); + } + } + } + + private void updatePaths(Path tmpPath, Path taskTmpPath) { + String taskId = Utilities.getTaskId(jc); + this.tmpPath = tmpPath; + this.taskTmpPath = taskTmpPath; + finalPath = new Path(tmpPath, taskId); + outPath = new Path(taskTmpPath, Utilities.toTempPath(taskId)); + } + + /** + * Fixes tmpPath to point to the correct partition. Before this is called, tmpPath will default to + * the root tmp table dir fixTmpPath(..) works for DP + LB + multiple skewed values + merge. + * reason: 1. fixTmpPath(..) compares inputPath and tmpDepth, find out path difference and put it + * into newPath. Then add newpath to existing this.tmpPath and this.taskTmpPath. 2. The path + * difference between inputPath and tmpDepth can be DP or DP+LB. It will automatically handle it. + * 3. For example, if inputpath is /-ext-10002/hr=a1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/ + * HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME tmppath is /_tmp.-ext-10000 newpath will be + * hr=a1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME Then, + * this.tmpPath and this.taskTmpPath will be update correctly. We have list_bucket_dml_6.q cover + * this case: DP + LP + multiple skewed values + merge. 
+ * + * @param inputPath - input path + * @throws IOException + */ + protected void fixTmpPath(Path inputPath) throws IOException { + dpPath = inputPath; + Path newPath = new Path("."); + int inputDepth = inputPath.depth(); + int tmpDepth = tmpPath.depth(); + + // Build the path from bottom up + while (inputPath != null && inputDepth > tmpDepth) { + newPath = new Path(inputPath.getName(), newPath); + inputDepth--; + inputPath = inputPath.getParent(); + } + + Path newTmpPath = new Path(tmpPath, newPath); + Path newTaskTmpPath = new Path(taskTmpPath, newPath); + if (!fs.exists(newTmpPath)) { + fs.mkdirs(newTmpPath); + } + updatePaths(newTmpPath, newTaskTmpPath); + } + + /** + * Fixes tmpPath to point to the correct list bucketing sub-directories. Before this is called, + * tmpPath will default to the root tmp table dir Reason to add a new method instead of changing + * fixTmpPath() Reason 1: logic has slightly difference fixTmpPath(..) needs 2 variables in order + * to decide path delta which is in variable newPath. 1. inputPath.depth() 2. tmpPath.depth() + * fixTmpPathConcatenate needs 2 variables too but one of them is different from fixTmpPath(..) 1. + * inputPath.depth() 2. listBucketingDepth Reason 2: less risks The existing logic is a little not + * trivial around map() and fixTmpPath(). In order to ensure minimum impact on existing flow, we + * try to avoid change on existing code/flow but add new code for new feature. + * + * @param inputPath - input path + * @throws IOException + */ + protected void fixTmpPathConcatenate(Path inputPath) throws IOException { + dpPath = inputPath; + Path newPath = new Path("."); + + int depth = listBucketingDepth; + // Build the path from bottom up. pick up list bucketing subdirectories + while ((inputPath != null) && (depth > 0)) { + newPath = new Path(inputPath.getName(), newPath); + inputPath = inputPath.getParent(); + depth--; + } + + Path newTmpPath = new Path(tmpPath, newPath); + Path newTaskTmpPath = new Path(taskTmpPath, newPath); + if (!fs.exists(newTmpPath)) { + fs.mkdirs(newTmpPath); + } + updatePaths(newTmpPath, newTaskTmpPath); + } + + /** + * Validates that each input path belongs to the same partition since each mapper merges the input + * to a single output directory + * + * @param inputPath - input path + */ + protected void checkPartitionsMatch(Path inputPath) throws IOException { + if (!dpPath.equals(inputPath)) { + // Temp partition input path does not match exist temp path + String msg = "Multiple partitions for one merge mapper: " + dpPath + + " NOT EQUAL TO " + + inputPath; + LOG.error(msg); + throw new IOException(msg); + } + } + + protected void fixTmpPathAlterTable(Path path) throws IOException { + + /** + * 1. boolean isListBucketingAlterTableConcatenate will be true only if it is alter table ... + * concatenate on stored-as-dir so it will handle list bucketing alter table merge in the if + * cause with the help of fixTmpPathConcatenate 2. If it is DML, + * isListBucketingAlterTableConcatenate will be false so that it will be handled by else + * cause. In this else cause, we have another if check. 2.1 the if check will make sure DP or + * LB, we will fix path with the help of fixTmpPath(..). Since both has sub-directories. it + * includes SP + LB. 2.2 only SP without LB, we dont fix path. + */ + + // Fix temp path for alter table ... 
concatenate + if (isListBucketingAlterTableConcatenate) { + if (this.tmpPathFixedConcatenate) { + checkPartitionsMatch(path); + } else { + fixTmpPathConcatenate(path); + tmpPathFixedConcatenate = true; + } + } else { + if (hasDynamicPartitions || (listBucketingDepth > 0)) { + if (tmpPathFixed) { + checkPartitionsMatch(path); + } else { + // We haven't fixed the TMP path for this mapper yet + fixTmpPath(path); + tmpPathFixed = true; + } + } + } + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileTezProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileTezProcessor.java new file mode 100644 index 0000000..1c7ec52 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MergeFileTezProcessor.java @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.tez; + +/** + * A tez processor for fast file merging. + */ +public class MergeFileTezProcessor extends MapTezProcessor { + + public MergeFileTezProcessor() { + super(); + } + + public MergeFileTezProcessor(MergeFileFormat fileFormat) { + super(fileFormat); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/OrcMergeFileMapRecordProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/OrcMergeFileMapRecordProcessor.java new file mode 100644 index 0000000..c0ba725 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/OrcMergeFileMapRecordProcessor.java @@ -0,0 +1,171 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.exec.tez; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.orc.*; +import org.apache.hadoop.hive.shims.CombineHiveKey; +import org.apache.tez.mapreduce.input.MRInputLegacy; +import org.apache.tez.runtime.library.api.KeyValueReader; + +import java.io.IOException; +import java.util.List; + +/** + * Map record processor for fast merging of ORC files + */ +public class OrcMergeFileMapRecordProcessor + extends MergeFileMapRecordProcessor { + public static final Log + LOG = LogFactory.getLog(OrcMergeFileMapRecordProcessor.class); + private Path prevPath; + private Reader reader; + private Writer outWriter; + private CompressionKind compression; + private long compressBuffSize; + private List version; + private int columnCount; + private long rowIndexStride; + private FSDataInputStream fdis; + + @Override + void run() throws IOException { + MRInputLegacy in = TezProcessor.getMRInput(inputs); + KeyValueReader reader = in.getReader(); + + //process records until done + while (reader.next()) { + processKeyValuePairs(reader.getCurrentKey(), reader.getCurrentValue()); + } + } + + private void processKeyValuePairs(Object key, Object value) + throws IOException { + try { + OrcFileValueWrapper v; + OrcFileKeyWrapper k; + if (key instanceof CombineHiveKey) { + k = (OrcFileKeyWrapper) ((CombineHiveKey) key).getKey(); + } else { + k = (OrcFileKeyWrapper) key; + } + + fixTmpPathAlterTable(k.getInputPath().getParent()); + + v = (OrcFileValueWrapper) value; + + if (prevPath == null) { + prevPath = k.getInputPath(); + reader = OrcFile.createReader(fs, k.getInputPath()); + } + + // store the orc configuration from the first file. All other files should + // match this configuration before merging + if (outWriter == null) { + compression = k.getCompression(); + compressBuffSize = k.getCompressBufferSize(); + version = k.getVersionList(); + columnCount = k.getTypes().get(0).getSubtypesCount(); + rowIndexStride = k.getRowIndexStride(); + + // block size and stripe size will be from config + outWriter = OrcFile.createWriter(outPath, + OrcFile.writerOptions(jconf).compress(compression) + .inspector(reader.getObjectInspector())); + LOG.info("ORC merge file output path: " + outPath); + } + // check compatibility with subsequent files + if ((k.getTypes().get(0).getSubtypesCount() != columnCount)) { + throw new IOException( + "ORCFileMerge failed because the input files are not compatible." + + " Column counts does not match."); + } + + if (!k.getCompression().equals(compression)) { + throw new IOException( + "ORCFileMerge failed because the input files are not compatible." + + " Compression codec does not match."); + } + + if (k.getCompressBufferSize() != compressBuffSize) { + throw new IOException( + "ORCFileMerge failed because the input files are not compatible." + + " Compression buffer size does not match."); + + } + + if (!k.getVersionList().equals(version)) { + throw new IOException( + "ORCFileMerge failed because the input files are not compatible." + + " Version does not match."); + } + + if (k.getRowIndexStride() != rowIndexStride) { + throw new IOException( + "ORCFileMerge failed because the input files are not compatible." 
+ + " Row index stride does not match."); + } + + // next file in the path + if (!k.getInputPath().equals(prevPath)) { + reader = OrcFile.createReader(fs, k.getInputPath()); + } + + // initialize buffer to read the entire stripe + byte[] buffer = new byte[(int) v.getStripeInformation().getLength()]; + fdis = fs.open(k.getInputPath()); + fdis.readFully(v.getStripeInformation().getOffset(), buffer, 0, + (int) v.getStripeInformation().getLength()); + + // append the stripe buffer to the new ORC file + outWriter.appendStripe(buffer, 0, buffer.length, v.getStripeInformation(), + v.getStripeStatistics()); + + LOG.info("Merged stripe from file " + k.getInputPath() + " [ offset : " + + v.getStripeInformation().getOffset() + " length: " + + v.getStripeInformation().getLength() + " ]"); + + // add user metadata to footer in case of any + if (v.isLastStripeInFile()) { + outWriter.appendUserMetadata(v.getUserMetadata()); + } + } catch (Throwable e) { + this.exception = true; + close(); + throw new IOException(e); + } + } + + @Override + void close() throws IOException { + if (fdis != null) { + fdis.close(); + fdis = null; + } + + if (outWriter != null) { + outWriter.close(); + outWriter = null; + } + + super.close(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/OrcMergeFileTezProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/OrcMergeFileTezProcessor.java new file mode 100644 index 0000000..2ae04d8 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/OrcMergeFileTezProcessor.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.tez; + +/** + * A tez processor for fast file merging of ORC files. + */ +public class OrcMergeFileTezProcessor extends MergeFileTezProcessor { + public OrcMergeFileTezProcessor() { + super(MergeFileFormat.ORC); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RCFileMergeFileMapRecordProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RCFileMergeFileMapRecordProcessor.java new file mode 100644 index 0000000..50689bc --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RCFileMergeFileMapRecordProcessor.java @@ -0,0 +1,108 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.exec.tez; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.io.RCFile; +import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; +import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileKeyBufferWrapper; +import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileValueBufferWrapper; +import org.apache.hadoop.hive.shims.CombineHiveKey; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.tez.mapreduce.input.MRInputLegacy; +import org.apache.tez.runtime.library.api.KeyValueReader; + +import java.io.IOException; + +/** + * Map record processor for fast merging of RCfile + */ +public class RCFileMergeFileMapRecordProcessor + extends MergeFileMapRecordProcessor { + public final static Log LOG = LogFactory + .getLog("RCFileMergeFileMapRecordProcessor"); + RCFile.Writer outWriter; + CompressionCodec codec = null; + int columnNumber = 0; + + @Override + void run() throws IOException { + MRInputLegacy in = TezProcessor.getMRInput(inputs); + KeyValueReader reader = in.getReader(); + + //process records until done + while (reader.next()) { + processKeyValuePairs(reader.getCurrentKey(), reader.getCurrentValue()); + } + } + + private void processKeyValuePairs(Object k, Object v) + throws IOException { + try { + + RCFileKeyBufferWrapper key; + if (k instanceof CombineHiveKey) { + key = (RCFileKeyBufferWrapper) ((CombineHiveKey) k).getKey(); + } else { + key = (RCFileKeyBufferWrapper) k; + } + RCFileValueBufferWrapper value = (RCFileValueBufferWrapper) v; + fixTmpPathAlterTable(key.getInputPath().getParent()); + + if (outWriter == null) { + codec = key.getCodec(); + columnNumber = key.getKeyBuffer().getColumnNumber(); + RCFileOutputFormat.setColumnNumber(jc, columnNumber); + outWriter = new RCFile.Writer(fs, jc, outPath, null, codec); + } + + boolean sameCodec = ((codec == key.getCodec()) || codec.getClass().equals( + key.getCodec().getClass())); + + if ((key.getKeyBuffer().getColumnNumber() != columnNumber) || + (!sameCodec)) { + throw new IOException( + "RCFileMerge failed because the input files use different " + + "CompressionCodec or have different column number setting."); + } + + outWriter.flushBlock(key.getKeyBuffer(), value.getValueBuffer(), + key.getRecordLength(), + key.getKeyLength(), key.getCompressedKeyLength()); + } catch (Throwable e) { + this.exception = true; + close(); + throw new IOException(e); + } + } + + + @Override + void close() throws IOException { + // close writer + if (outWriter == null) { + return; + } + + outWriter.close(); + outWriter = null; + + super.close(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java index 1577827..6845bd9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java @@ -16,6 +16,7 @@ * limitations under the License. 
*/ package org.apache.hadoop.hive.ql.exec.tez; +import java.io.IOException; import java.lang.management.ManagementFactory; import java.lang.management.MemoryMXBean; import java.net.URLClassLoader; @@ -109,7 +110,7 @@ void init(JobConf jconf, TezProcessorContext processorContext, MRTaskReporter mr abstract void run() throws Exception; - abstract void close(); + abstract void close() throws IOException; /** * Log information to be logged at the end diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java index c2ba782..620c286 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java @@ -45,12 +45,14 @@ * Does what ExecMapper and ExecReducer does for hive in MR framework. */ public class TezProcessor implements LogicalIOProcessor { - - - private static final Log LOG = LogFactory.getLog(TezProcessor.class); - private boolean isMap = false; + private boolean isMap = false; + public static enum MergeFileFormat { + ORC, RCFile + } + private boolean isMergeFile = false; + private MergeFileFormat mergeFileFormat; RecordProcessor rproc = null; private JobConf jobConf; @@ -70,7 +72,13 @@ } public TezProcessor(boolean isMap) { + this(isMap, null); + } + + public TezProcessor(boolean isMap, MergeFileFormat fileFormat) { this.isMap = isMap; + this.isMergeFile = true; + this.mergeFileFormat = fileFormat; } @Override @@ -136,7 +144,15 @@ public void run(Map inputs, Map out LOG.info("Running task: " + processorContext.getUniqueIdentifier()); if (isMap) { - rproc = new MapRecordProcessor(); + if (isMergeFile && mergeFileFormat != null) { + if (mergeFileFormat.equals(MergeFileFormat.ORC)) { + rproc = new OrcMergeFileMapRecordProcessor(); + } else { + rproc = new RCFileMergeFileMapRecordProcessor(); + } + } else { + rproc = new MapRecordProcessor(); + } MRInputLegacy mrInput = getMRInput(inputs); try { mrInput.init(); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java index 951e918..ab2d658 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java @@ -18,25 +18,18 @@ package org.apache.hadoop.hive.ql.exec.tez; -import java.util.Collection; -import java.util.Collections; -import java.util.EnumSet; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; - import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.Context; import org.apache.hadoop.hive.ql.DriverContext; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.merge.MergeFileWork; import org.apache.hadoop.hive.ql.log.PerfLogger; +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.BaseWork; +import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.ReduceWork; @@ -47,6 +40,7 @@ import org.apache.hadoop.hive.ql.plan.api.StageType; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Reporter; 
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.api.records.LocalResource; import org.apache.tez.common.counters.CounterGroup; @@ -61,6 +55,16 @@ import org.apache.tez.dag.api.client.DAGClient; import org.apache.tez.dag.api.client.StatusGetOpts; +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + /** * * TezTask handles the execution of TezWork. Currently it executes a graph of map and reduce work @@ -71,11 +75,14 @@ public class TezTask extends Task { private static final String CLASS_NAME = TezTask.class.getName(); + public static String BACKUP_PREFIX = "_backup."; private final PerfLogger perfLogger = PerfLogger.getPerfLogger(); private TezCounters counters; private final DagUtils utils; + private boolean success = true; + private JobConf jobConf; public TezTask() { this(DagUtils.getInstance()); @@ -113,7 +120,7 @@ public int execute(DriverContext driverContext) { ss.setTezSession(session); // jobConf will hold all the configuration for hadoop, tez, and hive - JobConf jobConf = utils.createConfiguration(conf); + jobConf = utils.createConfiguration(conf); // Get all user jars from work (e.g. input format stuff). String[] inputOutputJars = work.configureJobConfAndExtractJars(jobConf); @@ -177,7 +184,9 @@ public int execute(DriverContext driverContext) { } catch (Exception e) { LOG.error("Failed to execute tez graph.", e); // rc will be 1 at this point indicating failure. + success = false; } finally { + Utilities.clearWork(conf); if (cleanContext) { try { @@ -331,6 +340,14 @@ int close(TezWork work, int rc) { for (Operator op: w.getAllOperators()) { op.jobClose(conf, rc == 0); } + + // For MergeFileWork moving of temp files to right place has to be done + // manually as there is no operator tree to handle jobClose. 
+ if (w instanceof MergeFileWork) { + Path outPath = ((MergeFileWork) w).getOutputDir(); + closeJob(outPath, success, jobConf, + ((MergeFileWork) w).getDynPartCtx(), null); + } } } catch (Exception e) { // jobClose needs to execute successfully otherwise fail task @@ -344,6 +361,30 @@ int close(TezWork work, int rc) { return rc; } + private Path backupOutputPath(FileSystem fs, Path outpath) + throws IOException, HiveException { + if (fs.exists(outpath)) { + Path backupPath = new Path(outpath.getParent(), BACKUP_PREFIX + + outpath.getName()); + Utilities.rename(fs, outpath, backupPath); + return backupPath; + } else { + return null; + } + } + + private void closeJob(Path outputPath, boolean success, JobConf job, + DynamicPartitionCtx dynPartCtx, Reporter reporter + ) throws HiveException, IOException { + FileSystem fs = outputPath.getFileSystem(job); + Path backupPath = backupOutputPath(fs, outputPath); + Utilities.mvFileToFinalPath(outputPath, job, success, LOG, dynPartCtx, null, + reporter); + if (backupPath != null) { + fs.delete(backupPath, true); + } + } + @Override public boolean isMapRedTask() { return true; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/tools/RCFileMergeFileTezProcessor.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/tools/RCFileMergeFileTezProcessor.java new file mode 100644 index 0000000..96a4e1a --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/tools/RCFileMergeFileTezProcessor.java @@ -0,0 +1,29 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.exec.tez.tools; + +import org.apache.hadoop.hive.ql.exec.tez.MergeFileTezProcessor; + +/** + * A tez processor for fast file merging of RCFile + */ +public class RCFileMergeFileTezProcessor extends MergeFileTezProcessor { + public RCFileMergeFileTezProcessor() { + super(MergeFileFormat.RCFile); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java index bf44548..eb0eebd 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/CombineHiveInputFormat.java @@ -18,19 +18,7 @@ package org.apache.hadoop.hive.ql.io; -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Queue; -import java.util.Set; - +import com.google.common.collect.Lists; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileStatus; @@ -40,6 +28,7 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.merge.MergeFileWork; import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.io.orc.OrcRecordUpdater; import org.apache.hadoop.hive.ql.log.PerfLogger; @@ -61,6 +50,19 @@ import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.TextInputFormat; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.Set; + /** * CombineHiveInputFormat is a parameterized InputFormat which looks at the path @@ -279,8 +281,18 @@ public int hashCode() { // rpc if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) { try { - List dirs = Utilities.getInputPathsTez(job, mrwork); - Utilities.setInputPaths(job, dirs); + if (mrwork instanceof MergeFileWork) { + // MergeFileWork does not have aliasToWork, hence create paths from + // pathToAliases key set + List dirs = Lists.newArrayList(); + for (String path : pathToAliases.keySet()) { + dirs.add(new Path(path)); + } + Utilities.setInputPaths(job, dirs); + } else { + List dirs = Utilities.getInputPathsTez(job, mrwork); + Utilities.setInputPaths(job, dirs); + } } catch (Exception e) { throw new IOException("Could not create input paths", e); } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileInputFormat.java new file mode 100644 index 0000000..92ddce5 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileInputFormat.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.merge; + +import java.io.IOException; + +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; + +public abstract class MergeFileInputFormat extends FileInputFormat { + + @Override + public abstract RecordReader getRecordReader(InputSplit split, JobConf job, + Reporter reporter) throws IOException; + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileMapper.java ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileMapper.java new file mode 100644 index 0000000..dd7f730 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileMapper.java @@ -0,0 +1,217 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.merge; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.MapReduceBase; + +import java.io.IOException; + +public class MergeFileMapper extends MapReduceBase { + protected JobConf jc; + protected Path finalPath; + protected FileSystem fs; + protected boolean exception = false; + protected boolean autoDelete = false; + protected Path outPath; + protected boolean hasDynamicPartitions = false; + protected boolean isListBucketingDML = false; + protected boolean isListBucketingAlterTableConcatenate = false; + //used as depth for dir-calculation and if it is list bucketing case. 
+ protected int listBucketingDepth; + protected boolean tmpPathFixedConcatenate = false; + protected boolean tmpPathFixed = false; + protected Path tmpPath; + protected Path taskTmpPath; + protected Path dpPath; + + public final static Log LOG = LogFactory.getLog("MergeMapper"); + + @Override + public void configure(JobConf job) { + jc = job; + hasDynamicPartitions = HiveConf.getBoolVar(job, + HiveConf.ConfVars.HIVEMERGECURRENTJOBHASDYNAMICPARTITIONS); + isListBucketingAlterTableConcatenate = HiveConf.getBoolVar(job, + HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETING); + listBucketingDepth = HiveConf.getIntVar(job, + HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETINGDEPTH); + + Path specPath = MergeFileOutputFormat.getMergeOutputPath(job); + Path tmpPath = Utilities.toTempPath(specPath); + Path taskTmpPath = Utilities.toTaskTempPath(specPath); + updatePaths(tmpPath, taskTmpPath); + try { + fs = specPath.getFileSystem(job); + autoDelete = fs.deleteOnExit(outPath); + } catch (IOException e) { + this.exception = true; + throw new RuntimeException(e); + } + } + + private void updatePaths(Path tmpPath, Path taskTmpPath) { + String taskId = Utilities.getTaskId(jc); + this.tmpPath = tmpPath; + this.taskTmpPath = taskTmpPath; + finalPath = new Path(tmpPath, taskId); + outPath = new Path(taskTmpPath, Utilities.toTempPath(taskId)); + } + + + /** + * Validates that each input path belongs to the same partition, since each mapper merges its input + * to a single output directory. + * @param inputPath + * @throws HiveException + */ + protected void checkPartitionsMatch(Path inputPath) throws HiveException { + if (!dpPath.equals(inputPath)) { + // Temp partition input path does not match the existing temp path + String msg = "Multiple partitions for one block merge mapper: " + dpPath + " NOT EQUAL TO " + + inputPath; + LOG.error(msg); + throw new HiveException(msg); + } + } + + /** + * Fixes tmpPath to point to the correct partition. Before this is called, tmpPath defaults to + * the root tmp table dir. fixTmpPath(..) works for DP + LB + multiple skewed values + merge: + * 1. fixTmpPath(..) compares the depths of inputPath and tmpPath, finds the path difference and puts it + * into newPath. It then appends newPath to the existing this.tmpPath and this.taskTmpPath. 2. The path + * difference between inputPath and tmpPath can be DP or DP+LB; both are handled automatically. 
+ * 3. For example, if inputPath is /-ext-10002/hr=a1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/ + * HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME and tmpPath is /_tmp.-ext-10000, newPath will be + * hr=a1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME, and + * this.tmpPath and this.taskTmpPath will be updated correctly. list_bucket_dml_6.q covers + * this case: DP + LB + multiple skewed values + merge. + * @param inputPath + * @throws HiveException + * @throws IOException + */ + protected void fixTmpPath(Path inputPath) throws HiveException, IOException { + dpPath = inputPath; + Path newPath = new Path("."); + int inputDepth = inputPath.depth(); + int tmpDepth = tmpPath.depth(); + + // Build the path from the bottom up + while (inputPath != null && inputDepth > tmpDepth) { + newPath = new Path(inputPath.getName(), newPath); + inputDepth--; + inputPath = inputPath.getParent(); + } + + Path newTmpPath = new Path(tmpPath, newPath); + Path newTaskTmpPath = new Path(taskTmpPath, newPath); + if (!fs.exists(newTmpPath)) { + fs.mkdirs(newTmpPath); + } + updatePaths(newTmpPath, newTaskTmpPath); + } + + /** + * Fixes tmpPath to point to the correct list bucketing sub-directories. Before this is called, + * tmpPath defaults to the root tmp table dir. A new method is added instead of changing + * fixTmpPath() for two reasons. Reason 1: the logic differs slightly. fixTmpPath(..) needs 2 variables + * to decide the path delta held in variable newPath: 1. inputPath.depth() 2. tmpPath.depth(). + * fixTmpPathConcatenate also needs 2 variables, but one of them differs from fixTmpPath(..): 1. + * inputPath.depth() 2. listBucketingDepth. Reason 2: less risk. The existing logic around map() and + * fixTmpPath() is not entirely trivial. To ensure minimum impact on the existing flow, we + * avoid changing existing code/flow and instead add new code for the new feature. + * @param inputPath + * @throws HiveException + * @throws IOException + */ + protected void fixTmpPathConcatenate(Path inputPath) throws HiveException, IOException { + dpPath = inputPath; + Path newPath = new Path("."); + + int depth = listBucketingDepth; + // Build the path from the bottom up, picking up list bucketing subdirectories + while ((inputPath != null) && (depth > 0)) { + newPath = new Path(inputPath.getName(), newPath); + inputPath = inputPath.getParent(); + depth--; + } + + Path newTmpPath = new Path(tmpPath, newPath); + Path newTaskTmpPath = new Path(taskTmpPath, newPath); + if (!fs.exists(newTmpPath)) { + fs.mkdirs(newTmpPath); + } + updatePaths(newTmpPath, newTaskTmpPath); + } + + @Override + public void close() throws IOException { + if (!exception) { + FileStatus fss = fs.getFileStatus(outPath); + LOG.info("Renaming path " + outPath + " to " + finalPath + " . File size is " + fss.getLen()); + if (!fs.rename(outPath, finalPath)) { + throw new IOException("Unable to rename output to " + finalPath); + } + } else { + if (!autoDelete) { + fs.delete(outPath, true); + } + } + } + + protected void fixTmpPathAlterTable(Path path) throws IOException, HiveException { + /** + * 1. isListBucketingAlterTableConcatenate will be true only if it is alter table ... + * concatenate on a stored-as-dir table, so list bucketing alter table merge is handled in the if + * clause with the help of fixTmpPathConcatenate. 2. If it is DML, + * isListBucketingAlterTableConcatenate will be false, so it is handled by the else + * clause. In the else clause, we have another if check. 2.1 If it is DP or + * LB, we fix the path with the help of fixTmpPath(..), since both have sub-directories; this + * includes SP + LB. 2.2 For SP only, without LB, we don't fix the path. + */ + + // Fix temp path for alter table ... 
concatenate + if (isListBucketingAlterTableConcatenate) { + if (this.tmpPathFixedConcatenate) { + checkPartitionsMatch(path); + } else { + fixTmpPathConcatenate(path); + tmpPathFixedConcatenate = true; + } + } else { + if (hasDynamicPartitions || (listBucketingDepth > 0)) { + if (tmpPathFixed) { + checkPartitionsMatch(path); + } else { + // We haven't fixed the TMP path for this mapper yet + fixTmpPath(path); + tmpPathFixed = true; + } + } + } + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileOutputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileOutputFormat.java new file mode 100644 index 0000000..948b43d --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileOutputFormat.java @@ -0,0 +1,56 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.merge; + +import java.io.IOException; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordWriter; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.util.Progressable; + +public class MergeFileOutputFormat extends + FileOutputFormat { + + public static void setMergeOutputPath(JobConf job, Path path) { + job.set("hive.merge.output.dir", path.toString()); + } + + public static Path getMergeOutputPath(JobConf conf) { + String name = conf.get("hive.merge.output.dir"); + return name == null ? null: new Path(name); + } + + @Override + public RecordWriter getRecordWriter(FileSystem ignored, JobConf job, String name, + Progressable progress) throws IOException { + return new RecordWriter() { + public void write(Object key, Object value) { + throw new RuntimeException("Should not be called"); + } + + public void close(Reporter reporter) { + } + }; + } + +} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileTask.java ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileTask.java new file mode 100644 index 0000000..6d39371 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileTask.java @@ -0,0 +1,417 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.merge; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.DriverContext; +import org.apache.hadoop.hive.ql.QueryPlan; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.exec.mr.HadoopJobExecHelper; +import org.apache.hadoop.hive.ql.exec.mr.HadoopJobExecHook; +import org.apache.hadoop.hive.ql.exec.mr.Throttle; +import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; +import org.apache.hadoop.hive.ql.io.HiveOutputFormatImpl; +import org.apache.hadoop.hive.ql.io.RCFileInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx; +import org.apache.hadoop.hive.ql.plan.api.StageType; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; +import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.Counters; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.RunningJob; +import org.apache.log4j.Appender; +import org.apache.log4j.FileAppender; +import org.apache.log4j.LogManager; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Enumeration; +import java.util.List; + +public class MergeFileTask extends Task implements Serializable, + HadoopJobExecHook { + + private static final long serialVersionUID = 1L; + + public static String BACKUP_PREFIX = "_backup."; + + protected transient JobConf job; + protected HadoopJobExecHelper jobExecHelper; + + @Override + public void initialize(HiveConf conf, QueryPlan queryPlan, + DriverContext driverContext) { + super.initialize(conf, queryPlan, driverContext); + job = new JobConf(conf, MergeFileTask.class); + jobExecHelper = new HadoopJobExecHelper(job, this.console, this, this); + } + + @Override + public boolean requireLock() { + return true; + } + + boolean success = true; + + @Override + /** + * start a new map-reduce job to do the merge, almost the same as ExecDriver. 
+ */ + public int execute(DriverContext driverContext) { + HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, + CombineHiveInputFormat.class.getName()); + success = true; + ShimLoader.getHadoopShims().prepareJobOutput(job); + job.setOutputFormat(HiveOutputFormatImpl.class); + Class mapperClass = work.getMapperClass(work.getSourceTableInputFormat()); + LOG.info("Using " + mapperClass.getCanonicalName() + " mapper class."); + job.setMapperClass(mapperClass); + + Context ctx = driverContext.getCtx(); + boolean ctxCreated = false; + try { + if (ctx == null) { + ctx = new Context(job); + ctxCreated = true; + } + }catch (IOException e) { + e.printStackTrace(); + console.printError("Error launching map-reduce job", "\n" + + org.apache.hadoop.util.StringUtils.stringifyException(e)); + return 5; + } + + job.setMapOutputKeyClass(NullWritable.class); + job.setMapOutputValueClass(NullWritable.class); + if(work.getNumMapTasks() != null) { + job.setNumMapTasks(work.getNumMapTasks()); + } + + // zero reducers + job.setNumReduceTasks(0); + + if (work.getMinSplitSize() != null) { + HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, work + .getMinSplitSize().longValue()); + } + + if (work.getInputformat() != null) { + HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, work + .getInputformat()); + } + + String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT); + if ((inpFormat == null) || (!StringUtils.isNotBlank(inpFormat))) { + inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName(); + } + + LOG.info("Using " + inpFormat); + + try { + job.setInputFormat((Class) (Class + .forName(inpFormat))); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e.getMessage()); + } + + Path outputPath = this.work.getOutputDir(); + Path tempOutPath = Utilities.toTempPath(outputPath); + try { + FileSystem fs = tempOutPath.getFileSystem(job); + if (!fs.exists(tempOutPath)) { + fs.mkdirs(tempOutPath); + } + } catch (IOException e) { + console.printError("Can't make path " + outputPath + " : " + e.getMessage()); + return 6; + } + + MergeFileOutputFormat.setMergeOutputPath(job, outputPath); + + job.setOutputKeyClass(NullWritable.class); + job.setOutputValueClass(NullWritable.class); + + HiveConf.setBoolVar(job, + HiveConf.ConfVars.HIVEMERGECURRENTJOBHASDYNAMICPARTITIONS, + work.hasDynamicPartitions()); + + HiveConf.setBoolVar(job, + HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETING, + work.isListBucketingAlterTableConcatenate()); + + HiveConf.setIntVar( + job, + HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETINGDEPTH, + ((work.getListBucketingCtx() == null) ? 0 : work.getListBucketingCtx() + .calculateListBucketingLevel())); + + int returnVal = 0; + RunningJob rj = null; + boolean noName = StringUtils.isEmpty(HiveConf.getVar(job, + HiveConf.ConfVars.HADOOPJOBNAME)); + + String jobName = null; + if (noName && this.getQueryPlan() != null) { + int maxlen = conf.getIntVar(HiveConf.ConfVars.HIVEJOBNAMELENGTH); + jobName = Utilities.abbreviate(this.getQueryPlan().getQueryStr(), + maxlen - 6); + } + + if (noName) { + // This is for a special case to ensure unit tests pass + HiveConf.setVar(job, HiveConf.ConfVars.HADOOPJOBNAME, + jobName != null ? 
jobName : "JOB" + Utilities.randGen.nextInt()); + } + + try { + addInputPaths(job, work); + + Utilities.setMapWork(job, work, ctx.getMRTmpPath(), true); + + // remove the pwd from the conf file so that the job tracker doesn't show + // it in the logs + String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD); + if (pwd != null) { + HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE"); + } + JobClient jc = new JobClient(job); + + String addedJars = Utilities.getResourceFiles(job, SessionState.ResourceType.JAR); + if (!addedJars.isEmpty()) { + job.set("tmpjars", addedJars); + } + + // make this client wait if the job tracker is not behaving well. + Throttle.checkJobTracker(job, LOG); + + // Finally SUBMIT the JOB! + rj = jc.submitJob(job); + + returnVal = jobExecHelper.progress(rj, jc, null); + success = (returnVal == 0); + + } catch (Exception e) { + e.printStackTrace(); + String mesg = " with exception '" + Utilities.getNameMessage(e) + "'"; + if (rj != null) { + mesg = "Ended Job = " + rj.getJobID() + mesg; + } else { + mesg = "Job Submission failed" + mesg; + } + + // Has to use full name to make sure it does not conflict with + // org.apache.commons.lang.StringUtils + console.printError(mesg, "\n" + + org.apache.hadoop.util.StringUtils.stringifyException(e)); + + success = false; + returnVal = 1; + } finally { + try { + if (ctxCreated) { + ctx.clear(); + } + if (rj != null) { + if (returnVal != 0) { + rj.killJob(); + } + HadoopJobExecHelper.runningJobs.remove(rj); + jobID = rj.getID().toString(); + } + closeJob(outputPath, success, job, work.getDynPartCtx(), null); + } catch (Exception e) { + } + } + + return (returnVal); + } + + private Path backupOutputPath(FileSystem fs, Path outpath, JobConf job) + throws IOException, HiveException { + if (fs.exists(outpath)) { + Path backupPath = new Path(outpath.getParent(), BACKUP_PREFIX + + outpath.getName()); + Utilities.rename(fs, outpath, backupPath); + return backupPath; + } else { + return null; + } + } + + private void closeJob(Path outputPath, boolean success, JobConf job, + DynamicPartitionCtx dynPartCtx, Reporter reporter + ) throws HiveException, IOException { + FileSystem fs = outputPath.getFileSystem(job); + Path backupPath = backupOutputPath(fs, outputPath, job); + Utilities.mvFileToFinalPath(outputPath, job, success, LOG, dynPartCtx, null, + reporter); + // backupPath is null when there was no existing output to back up + if (backupPath != null) { + fs.delete(backupPath, true); + } + } + + private void addInputPaths(JobConf job, MergeFileWork work) { + for (Path path : work.getInputPaths()) { + FileInputFormat.addInputPath(job, path); + } + } + + @Override + public String getName() { + return "MergeFileTask"; + } + + public static String INPUT_SEPERATOR = ":"; + + public static void main(String[] args) { + String inputPathStr = null; + String outputDir = null; + String jobConfFileName = null; + String format = null; + + try { + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-input")) { + inputPathStr = args[++i]; + } else if (args[i].equals("-jobconffile")) { + jobConfFileName = args[++i]; + } else if (args[i].equals("-outputDir")) { + outputDir = args[++i]; + } else if (args[i].equals("-format")) { + format = args[++i]; + } + } + } catch (IndexOutOfBoundsException e) { + System.err.println("Missing argument to option"); + printUsage(); + } + + if (inputPathStr == null || outputDir == null + || outputDir.trim().equals("")) { + printUsage(); + } + + List inputPaths = new ArrayList(); + String[] paths = inputPathStr.split(INPUT_SEPERATOR); + if (paths == null || paths.length == 0) { + printUsage(); + } + + FileSystem 
fs = null; + JobConf conf = new JobConf(MergeFileTask.class); + for (String path : paths) { + try { + Path pathObj = new Path(path); + if (fs == null) { + fs = FileSystem.get(pathObj.toUri(), conf); + } + FileStatus fstatus = fs.getFileStatus(pathObj); + if (fstatus.isDir()) { + FileStatus[] fileStatus = fs.listStatus(pathObj); + for (FileStatus st : fileStatus) { + inputPaths.add(st.getPath()); + } + } else { + inputPaths.add(fstatus.getPath()); + } + } catch (IOException e) { + e.printStackTrace(System.err); + } + } + + if (jobConfFileName != null) { + conf.addResource(new Path(jobConfFileName)); + } + HiveConf hiveConf = new HiveConf(conf, MergeFileTask.class); + + Log LOG = LogFactory.getLog(MergeFileTask.class.getName()); + boolean isSilent = HiveConf.getBoolVar(conf, + HiveConf.ConfVars.HIVESESSIONSILENT); + LogHelper console = new LogHelper(LOG, isSilent); + + // print out the location of the log file for the user so + // that it's easy to find reason for local mode execution failures + for (Appender appender : Collections + .list((Enumeration) LogManager.getRootLogger() + .getAllAppenders())) { + if (appender instanceof FileAppender) { + console.printInfo("Execution log at: " + + ((FileAppender) appender).getFile()); + } + } + + MergeFileWork mergeWork = null; + if (format.equals("rcfile")) { + mergeWork = new MergeFileWork(inputPaths, new Path(outputDir), + RCFileInputFormat.class.getName()); + } else if (format.equals("orcfile")) { + mergeWork = new MergeFileWork(inputPaths, new Path(outputDir), + OrcInputFormat.class.getName()); + } + + DriverContext driverCxt = new DriverContext(); + MergeFileTask taskExec = new MergeFileTask(); + taskExec.initialize(hiveConf, null, driverCxt); + taskExec.setWork(mergeWork); + int ret = taskExec.execute(driverCxt); + + if (ret != 0) { + System.exit(2); + } + + } + + private static void printUsage() { + System.err.println("MergeFileTask -format -input " + + "-outputDir outputDir [-jobconffile ] "); + System.exit(1); + } + + @Override + public StageType getType() { + return StageType.MAPRED; + } + + @Override + public boolean checkFatalErrors(Counters ctrs, StringBuilder errMsg) { + return false; + } + + @Override + public void logPlanProgress(SessionState ss) throws IOException { + // no op + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileWork.java ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileWork.java new file mode 100644 index 0000000..bcee1b3 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeFileWork.java @@ -0,0 +1,271 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.io.merge; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.HiveStatsUtils; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; +import org.apache.hadoop.hive.ql.io.RCFileInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcFileMergeMapper; +import org.apache.hadoop.hive.ql.io.orc.OrcFileStripeMergeInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileBlockMergeInputFormat; +import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileMergeMapper; +import org.apache.hadoop.hive.ql.plan.*; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.Mapper; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; + +@Explain(displayName = "Merge File Work") +public class MergeFileWork extends MapWork implements Serializable { + + private static transient final Log LOG = + LogFactory.getLog(MergeFileWork.class); + private transient List inputPaths; + private transient Path outputDir; + private boolean hasDynamicPartitions; + private DynamicPartitionCtx dynPartCtx; + private boolean isListBucketingAlterTableConcatenate; + private ListBucketingCtx listBucketingCtx; + private String srcTblInputFormat; + + public MergeFileWork() { + } + + public MergeFileWork(List inputPaths, Path outputDir, + String srcTblInputFormat) { + this(inputPaths, outputDir, false, null, srcTblInputFormat); + } + + public MergeFileWork(List inputPaths, Path outputDir, + boolean hasDynamicPartitions, + DynamicPartitionCtx dynPartCtx, + String srcTblInputFormat) { + super(); + this.inputPaths = inputPaths; + this.outputDir = outputDir; + this.hasDynamicPartitions = hasDynamicPartitions; + this.dynPartCtx = dynPartCtx; + this.srcTblInputFormat = srcTblInputFormat; + PartitionDesc partDesc = new PartitionDesc(); + if (srcTblInputFormat.equals(OrcInputFormat.class.getName())) { + partDesc.setInputFileFormatClass(OrcFileStripeMergeInputFormat.class); + } else if (srcTblInputFormat.equals(RCFileInputFormat.class.getName())) { + partDesc.setInputFileFormatClass(RCFileBlockMergeInputFormat.class); + } + if (this.getPathToPartitionInfo() == null) { + this.setPathToPartitionInfo(new LinkedHashMap()); + } + for (Path path : this.inputPaths) { + this.getPathToPartitionInfo().put(path.toString(), partDesc); + } + this.isListBucketingAlterTableConcatenate = false; + } + + public List getInputPaths() { + return inputPaths; + } + + public void setInputPaths(List inputPaths) { + this.inputPaths = inputPaths; + } + + public Path getOutputDir() { + return outputDir; + } + + public void setOutputDir(Path outputDir) { + this.outputDir = outputDir; + } + + public Class getMapperClass(String klass) { + if (klass.equals(RCFileInputFormat.class.getName())) { + return RCFileMergeMapper.class; + } else if (klass.equals(OrcInputFormat.class.getName())) { + return OrcFileMergeMapper.class; + } + return null; + } + + @Override + public Long getMinSplitSize() { + return null; + } + + @Override + public String getInputformat() { + return getInputformatClass().getName(); + } + + public Class getInputformatClass() { + return CombineHiveInputFormat.class; + } + + @Override + public boolean 
isGatheringStats() { + return false; + } + + public boolean hasDynamicPartitions() { + return this.hasDynamicPartitions; + } + + public void setHasDynamicPartitions(boolean hasDynamicPartitions) { + this.hasDynamicPartitions = hasDynamicPartitions; + } + + @Override + public void resolveDynamicPartitionStoredAsSubDirsMerge(HiveConf conf, + Path path, + TableDesc tblDesc, + ArrayList aliases, + PartitionDesc partDesc) { + String inputFormatClass = null; + if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) { + inputFormatClass = + conf.getVar(HiveConf.ConfVars.HIVEMERGEINPUTFORMATBLOCKLEVEL); + } else if (tblDesc.getInputFileFormatClass().equals(OrcInputFormat.class)) { + inputFormatClass = + conf.getVar(HiveConf.ConfVars.HIVEMERGEINPUTFORMATSTRIPELEVEL); + } + + try { + partDesc.setInputFileFormatClass((Class) + Class.forName(inputFormatClass)); + } catch (ClassNotFoundException e) { + String msg = "Merge input format class not found"; + throw new RuntimeException(msg); + } + super.resolveDynamicPartitionStoredAsSubDirsMerge(conf, path, tblDesc, + aliases, partDesc); + + // Add the DP path to the list of input paths + inputPaths.add(path); + LOG.info("Updated input paths for merging: " + inputPaths); + } + + /** + * alter table ... concatenate + *

+ * If it is a skewed table, use its subdirectories as input paths. + */ + public void resolveConcatenateMerge(HiveConf conf) { + isListBucketingAlterTableConcatenate = + ((listBucketingCtx == null) ? false : listBucketingCtx + .isSkewedStoredAsDir()); + LOG.info("isListBucketingAlterTableConcatenate : " + isListBucketingAlterTableConcatenate); + if (isListBucketingAlterTableConcatenate) { + // use the sub-dirs as input paths. + assert ((this.inputPaths != null) && (this.inputPaths.size() == 1)) : + "alter table ... concatenate should only have one directory inside inputpaths"; + Path dirPath = inputPaths.get(0); + try { + FileSystem inpFs = dirPath.getFileSystem(conf); + FileStatus[] status = + HiveStatsUtils.getFileStatusRecurse(dirPath, listBucketingCtx + .getSkewedColNames().size(), inpFs); + List newInputPath = new ArrayList(); + boolean succeed = true; + for (int i = 0; i < status.length; ++i) { + if (status[i].isDir()) { + // Add the lb path to the list of input paths + newInputPath.add(status[i].getPath()); + } else { + // found a file instead of a dir; don't change the input paths + succeed = false; + } + } + assert (succeed || ((!succeed) && newInputPath.isEmpty())) : + "This partition has an" + + " inconsistent file structure: " + + "it is stored-as-subdir and expected all files at the same depth" + + " of subdirectories."; + if (succeed) { + inputPaths.clear(); + inputPaths.addAll(newInputPath); + } + } catch (IOException e) { + String msg = + "Failed to get filesystem for directory " + dirPath.toUri(); + throw new RuntimeException(msg, e); + } + + } + } + + public DynamicPartitionCtx getDynPartCtx() { + return dynPartCtx; + } + + public void setDynPartCtx(DynamicPartitionCtx dynPartCtx) { + this.dynPartCtx = dynPartCtx; + } + + /** + * @return the listBucketingCtx + */ + public ListBucketingCtx getListBucketingCtx() { + return listBucketingCtx; + } + + /** + * @param listBucketingCtx the listBucketingCtx to set + */ + public void setListBucketingCtx(ListBucketingCtx listBucketingCtx) { + this.listBucketingCtx = listBucketingCtx; + } + + /** + * @return the isListBucketingAlterTableConcatenate + */ + public boolean isListBucketingAlterTableConcatenate() { + return isListBucketingAlterTableConcatenate; + } + + @Explain(displayName = "input format") + public String getSourceTableInputFormat() { + return srcTblInputFormat; + } + + public void setSourceTableInputFormat(String srcTblInputFormat) { + this.srcTblInputFormat = srcTblInputFormat; + } + + @Explain(displayName = "merge level") + public String getMergeLevel() { + if (srcTblInputFormat != null) { + if (srcTblInputFormat.equals(OrcInputFormat.class.getName())) { + return "stripe"; + } else if (srcTblInputFormat.equals(RCFileInputFormat.class.getName())) { + return "block"; + } + } + return null; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeInputFormat.java deleted file mode 100644 index 4651920..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeInputFormat.java +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io.merge; - -import java.io.IOException; - -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapred.Reporter; - -public abstract class MergeInputFormat extends FileInputFormat { - - @Override - public abstract RecordReader getRecordReader(InputSplit split, JobConf job, - Reporter reporter) throws IOException; - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeMapper.java ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeMapper.java deleted file mode 100644 index beb4f7d..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeMapper.java +++ /dev/null @@ -1,217 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io.merge; - -import java.io.IOException; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.MapReduceBase; - -public class MergeMapper extends MapReduceBase { - protected JobConf jc; - protected Path finalPath; - protected FileSystem fs; - protected boolean exception = false; - protected boolean autoDelete = false; - protected Path outPath; - protected boolean hasDynamicPartitions = false; - protected boolean isListBucketingDML = false; - protected boolean isListBucketingAlterTableConcatenate = false; - //used as depth for dir-calculation and if it is list bucketing case. 
- protected int listBucketingDepth; - protected boolean tmpPathFixedConcatenate = false; - protected boolean tmpPathFixed = false; - protected Path tmpPath; - protected Path taskTmpPath; - protected Path dpPath; - - public final static Log LOG = LogFactory.getLog("MergeMapper"); - - @Override - public void configure(JobConf job) { - jc = job; - hasDynamicPartitions = HiveConf.getBoolVar(job, - HiveConf.ConfVars.HIVEMERGECURRENTJOBHASDYNAMICPARTITIONS); - isListBucketingAlterTableConcatenate = HiveConf.getBoolVar(job, - HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETING); - listBucketingDepth = HiveConf.getIntVar(job, - HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETINGDEPTH); - - Path specPath = MergeOutputFormat.getMergeOutputPath(job); - Path tmpPath = Utilities.toTempPath(specPath); - Path taskTmpPath = Utilities.toTaskTempPath(specPath); - updatePaths(tmpPath, taskTmpPath); - try { - fs = specPath.getFileSystem(job); - autoDelete = fs.deleteOnExit(outPath); - } catch (IOException e) { - this.exception = true; - throw new RuntimeException(e); - } - } - - private void updatePaths(Path tmpPath, Path taskTmpPath) { - String taskId = Utilities.getTaskId(jc); - this.tmpPath = tmpPath; - this.taskTmpPath = taskTmpPath; - finalPath = new Path(tmpPath, taskId); - outPath = new Path(taskTmpPath, Utilities.toTempPath(taskId)); - } - - - /** - * Validates that each input path belongs to the same partition since each mapper merges the input - * to a single output directory - * @param inputPath - * @throws HiveException - */ - protected void checkPartitionsMatch(Path inputPath) throws HiveException { - if (!dpPath.equals(inputPath)) { - // Temp partition input path does not match exist temp path - String msg = "Multiple partitions for one block merge mapper: " + dpPath + " NOT EQUAL TO " - + inputPath; - LOG.error(msg); - throw new HiveException(msg); - } - } - - /** - * Fixes tmpPath to point to the correct partition. Before this is called, tmpPath will default to - * the root tmp table dir fixTmpPath(..) works for DP + LB + multiple skewed values + merge. - * reason: 1. fixTmpPath(..) compares inputPath and tmpDepth, find out path difference and put it - * into newPath. Then add newpath to existing this.tmpPath and this.taskTmpPath. 2. The path - * difference between inputPath and tmpDepth can be DP or DP+LB. It will automatically handle it. - * 3. For example, if inputpath is /-ext-10002/hr=a1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/ - * HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME tmppath is /_tmp.-ext-10000 newpath will be - * hr=a1/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME/HIVE_DEFAULT_LIST_BUCKETING_DIR_NAME Then, - * this.tmpPath and this.taskTmpPath will be update correctly. We have list_bucket_dml_6.q cover - * this case: DP + LP + multiple skewed values + merge. 
- * @param inputPath - * @throws HiveException - * @throws IOException - */ - protected void fixTmpPath(Path inputPath) throws HiveException, IOException { - dpPath = inputPath; - Path newPath = new Path("."); - int inputDepth = inputPath.depth(); - int tmpDepth = tmpPath.depth(); - - // Build the path from bottom up - while (inputPath != null && inputDepth > tmpDepth) { - newPath = new Path(inputPath.getName(), newPath); - inputDepth--; - inputPath = inputPath.getParent(); - } - - Path newTmpPath = new Path(tmpPath, newPath); - Path newTaskTmpPath = new Path(taskTmpPath, newPath); - if (!fs.exists(newTmpPath)) { - fs.mkdirs(newTmpPath); - } - updatePaths(newTmpPath, newTaskTmpPath); - } - - /** - * Fixes tmpPath to point to the correct list bucketing sub-directories. Before this is called, - * tmpPath will default to the root tmp table dir Reason to add a new method instead of changing - * fixTmpPath() Reason 1: logic has slightly difference fixTmpPath(..) needs 2 variables in order - * to decide path delta which is in variable newPath. 1. inputPath.depth() 2. tmpPath.depth() - * fixTmpPathConcatenate needs 2 variables too but one of them is different from fixTmpPath(..) 1. - * inputPath.depth() 2. listBucketingDepth Reason 2: less risks The existing logic is a little not - * trivial around map() and fixTmpPath(). In order to ensure minimum impact on existing flow, we - * try to avoid change on existing code/flow but add new code for new feature. - * @param inputPath - * @throws HiveException - * @throws IOException - */ - protected void fixTmpPathConcatenate(Path inputPath) throws HiveException, IOException { - dpPath = inputPath; - Path newPath = new Path("."); - - int depth = listBucketingDepth; - // Build the path from bottom up. pick up list bucketing subdirectories - while ((inputPath != null) && (depth > 0)) { - newPath = new Path(inputPath.getName(), newPath); - inputPath = inputPath.getParent(); - depth--; - } - - Path newTmpPath = new Path(tmpPath, newPath); - Path newTaskTmpPath = new Path(taskTmpPath, newPath); - if (!fs.exists(newTmpPath)) { - fs.mkdirs(newTmpPath); - } - updatePaths(newTmpPath, newTaskTmpPath); - } - - @Override - public void close() throws IOException { - if (!exception) { - FileStatus fss = fs.getFileStatus(outPath); - LOG.info("renamed path " + outPath + " to " + finalPath + " . File size is " + fss.getLen()); - if (!fs.rename(outPath, finalPath)) { - throw new IOException("Unable to rename output to " + finalPath); - } - } else { - if (!autoDelete) { - fs.delete(outPath, true); - } - } - } - - protected void fixTmpPathAlterTable(Path path) throws IOException, HiveException { - /** - * 1. boolean isListBucketingAlterTableConcatenate will be true only if it is alter table ... - * concatenate on stored-as-dir so it will handle list bucketing alter table merge in the if - * cause with the help of fixTmpPathConcatenate 2. If it is DML, - * isListBucketingAlterTableConcatenate will be false so that it will be handled by else - * cause. In this else cause, we have another if check. 2.1 the if check will make sure DP or - * LB, we will fix path with the help of fixTmpPath(..). Since both has sub-directories. it - * includes SP + LB. 2.2 only SP without LB, we dont fix path. - */ - - // Fix temp path for alter table ... 
concatenate - if (isListBucketingAlterTableConcatenate) { - if (this.tmpPathFixedConcatenate) { - checkPartitionsMatch(path); - } else { - fixTmpPathConcatenate(path); - tmpPathFixedConcatenate = true; - } - } else { - if (hasDynamicPartitions || (listBucketingDepth > 0)) { - if (tmpPathFixed) { - checkPartitionsMatch(path); - } else { - // We haven't fixed the TMP path for this mapper yet - fixTmpPath(path); - tmpPathFixed = true; - } - } - } - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeOutputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeOutputFormat.java deleted file mode 100644 index a3ce699..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeOutputFormat.java +++ /dev/null @@ -1,45 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.io.merge; - -import java.io.IOException; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapred.FileOutputFormat; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordWriter; -import org.apache.hadoop.util.Progressable; - -public abstract class MergeOutputFormat extends - FileOutputFormat { - - public static void setMergeOutputPath(JobConf job, Path path) { - job.set("hive.merge.output.dir", path.toString()); - } - - public static Path getMergeOutputPath(JobConf conf) { - String name = conf.get("hive.merge.output.dir"); - return name == null ? null: new Path(name); - } - - public abstract RecordWriter getRecordWriter( - FileSystem ignored, JobConf job, String name, Progressable progress) - throws IOException; -} \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeTask.java ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeTask.java deleted file mode 100644 index c437dd0..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeTask.java +++ /dev/null @@ -1,415 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.io.merge; - -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Enumeration; -import java.util.List; - -import org.apache.commons.lang.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.Context; -import org.apache.hadoop.hive.ql.DriverContext; -import org.apache.hadoop.hive.ql.QueryPlan; -import org.apache.hadoop.hive.ql.exec.Task; -import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.exec.mr.HadoopJobExecHelper; -import org.apache.hadoop.hive.ql.exec.mr.HadoopJobExecHook; -import org.apache.hadoop.hive.ql.exec.mr.Throttle; -import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; -import org.apache.hadoop.hive.ql.io.HiveOutputFormatImpl; -import org.apache.hadoop.hive.ql.io.RCFileInputFormat; -import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx; -import org.apache.hadoop.hive.ql.plan.api.StageType; -import org.apache.hadoop.hive.ql.session.SessionState; -import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; -import org.apache.hadoop.hive.shims.ShimLoader; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.Counters; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.InputFormat; -import org.apache.hadoop.mapred.JobClient; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Mapper; -import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.mapred.RunningJob; -import org.apache.log4j.Appender; -import org.apache.log4j.FileAppender; -import org.apache.log4j.LogManager; - -public class MergeTask extends Task implements Serializable, - HadoopJobExecHook { - - private static final long serialVersionUID = 1L; - - public static String BACKUP_PREFIX = "_backup."; - - protected transient JobConf job; - protected HadoopJobExecHelper jobExecHelper; - - @Override - public void initialize(HiveConf conf, QueryPlan queryPlan, - DriverContext driverContext) { - super.initialize(conf, queryPlan, driverContext); - job = new JobConf(conf, MergeTask.class); - jobExecHelper = new HadoopJobExecHelper(job, this.console, this, this); - } - - @Override - public boolean requireLock() { - return true; - } - - boolean success = true; - - @Override - /** - * start a new map-reduce job to do the merge, almost the same as ExecDriver. 
- */ - public int execute(DriverContext driverContext) { - HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, - CombineHiveInputFormat.class.getName()); - success = true; - ShimLoader.getHadoopShims().prepareJobOutput(job); - job.setOutputFormat(HiveOutputFormatImpl.class); - Class mapperClass = work.getMapperClass(work.getSourceTableInputFormat()); - LOG.info("Using " + mapperClass.getCanonicalName() + " mapper class."); - job.setMapperClass(mapperClass); - - Context ctx = driverContext.getCtx(); - boolean ctxCreated = false; - try { - if (ctx == null) { - ctx = new Context(job); - ctxCreated = true; - } - }catch (IOException e) { - e.printStackTrace(); - console.printError("Error launching map-reduce job", "\n" - + org.apache.hadoop.util.StringUtils.stringifyException(e)); - return 5; - } - - job.setMapOutputKeyClass(NullWritable.class); - job.setMapOutputValueClass(NullWritable.class); - if(work.getNumMapTasks() != null) { - job.setNumMapTasks(work.getNumMapTasks()); - } - - // zero reducers - job.setNumReduceTasks(0); - - if (work.getMinSplitSize() != null) { - HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, work - .getMinSplitSize().longValue()); - } - - if (work.getInputformat() != null) { - HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, work - .getInputformat()); - } - - String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT); - if ((inpFormat == null) || (!StringUtils.isNotBlank(inpFormat))) { - inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName(); - } - - LOG.info("Using " + inpFormat); - - try { - job.setInputFormat((Class) (Class - .forName(inpFormat))); - } catch (ClassNotFoundException e) { - throw new RuntimeException(e.getMessage()); - } - - Path outputPath = this.work.getOutputDir(); - Path tempOutPath = Utilities.toTempPath(outputPath); - try { - FileSystem fs = tempOutPath.getFileSystem(job); - if (!fs.exists(tempOutPath)) { - fs.mkdirs(tempOutPath); - } - } catch (IOException e) { - console.printError("Can't make path " + outputPath + " : " + e.getMessage()); - return 6; - } - - MergeOutputFormat.setMergeOutputPath(job, outputPath); - - job.setOutputKeyClass(NullWritable.class); - job.setOutputValueClass(NullWritable.class); - - HiveConf.setBoolVar(job, - HiveConf.ConfVars.HIVEMERGECURRENTJOBHASDYNAMICPARTITIONS, - work.hasDynamicPartitions()); - - HiveConf.setBoolVar(job, - HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETING, - work.isListBucketingAlterTableConcatenate()); - - HiveConf.setIntVar( - job, - HiveConf.ConfVars.HIVEMERGECURRENTJOBCONCATENATELISTBUCKETINGDEPTH, - ((work.getListBucketingCtx() == null) ? 0 : work.getListBucketingCtx() - .calculateListBucketingLevel())); - - int returnVal = 0; - RunningJob rj = null; - boolean noName = StringUtils.isEmpty(HiveConf.getVar(job, - HiveConf.ConfVars.HADOOPJOBNAME)); - - String jobName = null; - if (noName && this.getQueryPlan() != null) { - int maxlen = conf.getIntVar(HiveConf.ConfVars.HIVEJOBNAMELENGTH); - jobName = Utilities.abbreviate(this.getQueryPlan().getQueryStr(), - maxlen - 6); - } - - if (noName) { - // This is for a special case to ensure unit tests pass - HiveConf.setVar(job, HiveConf.ConfVars.HADOOPJOBNAME, - jobName != null ? 
jobName : "JOB" + Utilities.randGen.nextInt()); - } - - try { - addInputPaths(job, work); - - Utilities.setMapWork(job, work, ctx.getMRTmpPath(), true); - - // remove the pwd from conf file so that job tracker doesn't show this - // logs - String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD); - if (pwd != null) { - HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE"); - } - JobClient jc = new JobClient(job); - - String addedJars = Utilities.getResourceFiles(job, SessionState.ResourceType.JAR); - if (!addedJars.isEmpty()) { - job.set("tmpjars", addedJars); - } - - // make this client wait if job trcker is not behaving well. - Throttle.checkJobTracker(job, LOG); - - // Finally SUBMIT the JOB! - rj = jc.submitJob(job); - - returnVal = jobExecHelper.progress(rj, jc, null); - success = (returnVal == 0); - - } catch (Exception e) { - e.printStackTrace(); - String mesg = " with exception '" + Utilities.getNameMessage(e) + "'"; - if (rj != null) { - mesg = "Ended Job = " + rj.getJobID() + mesg; - } else { - mesg = "Job Submission failed" + mesg; - } - - // Has to use full name to make sure it does not conflict with - // org.apache.commons.lang.StringUtils - console.printError(mesg, "\n" - + org.apache.hadoop.util.StringUtils.stringifyException(e)); - - success = false; - returnVal = 1; - } finally { - try { - if (ctxCreated) { - ctx.clear(); - } - if (rj != null) { - if (returnVal != 0) { - rj.killJob(); - } - HadoopJobExecHelper.runningJobs.remove(rj); - jobID = rj.getID().toString(); - } - closeJob(outputPath, success, job, console, work.getDynPartCtx(), null); - } catch (Exception e) { - } - } - - return (returnVal); - } - - private Path backupOutputPath(FileSystem fs, Path outpath, JobConf job) - throws IOException, HiveException { - if (fs.exists(outpath)) { - Path backupPath = new Path(outpath.getParent(), BACKUP_PREFIX - + outpath.getName()); - Utilities.rename(fs, outpath, backupPath); - return backupPath; - } else { - return null; - } - } - - private void closeJob(Path outputPath, boolean success, JobConf job, - LogHelper console, DynamicPartitionCtx dynPartCtx, Reporter reporter - ) throws HiveException, IOException { - FileSystem fs = outputPath.getFileSystem(job); - Path backupPath = backupOutputPath(fs, outputPath, job); - Utilities.mvFileToFinalPath(outputPath, job, success, LOG, dynPartCtx, null, - reporter); - fs.delete(backupPath, true); - } - - private void addInputPaths(JobConf job, MergeWork work) { - for (Path path : work.getInputPaths()) { - FileInputFormat.addInputPath(job, path); - } - } - - @Override - public String getName() { - return "MergeTask"; - } - - public static String INPUT_SEPERATOR = ":"; - - public static void main(String[] args) { - String inputPathStr = null; - String outputDir = null; - String jobConfFileName = null; - String format = null; - - try { - for (int i = 0; i < args.length; i++) { - if (args[i].equals("-input")) { - inputPathStr = args[++i]; - } else if (args[i].equals("-jobconffile")) { - jobConfFileName = args[++i]; - } else if (args[i].equals("-outputDir")) { - outputDir = args[++i]; - } else if (args[i].equals("-format")) { - format = args[++i]; - } - } - } catch (IndexOutOfBoundsException e) { - System.err.println("Missing argument to option"); - printUsage(); - } - - if (inputPathStr == null || outputDir == null - || outputDir.trim().equals("")) { - printUsage(); - } - - List inputPaths = new ArrayList(); - String[] paths = inputPathStr.split(INPUT_SEPERATOR); - if (paths == null || paths.length == 0) { - printUsage(); 
- } - - FileSystem fs = null; - JobConf conf = new JobConf(MergeTask.class); - for (String path : paths) { - try { - Path pathObj = new Path(path); - if (fs == null) { - fs = FileSystem.get(pathObj.toUri(), conf); - } - FileStatus fstatus = fs.getFileStatus(pathObj); - if (fstatus.isDir()) { - FileStatus[] fileStatus = fs.listStatus(pathObj); - for (FileStatus st : fileStatus) { - inputPaths.add(st.getPath()); - } - } else { - inputPaths.add(fstatus.getPath()); - } - } catch (IOException e) { - e.printStackTrace(System.err); - } - } - - if (jobConfFileName != null) { - conf.addResource(new Path(jobConfFileName)); - } - HiveConf hiveConf = new HiveConf(conf, MergeTask.class); - - Log LOG = LogFactory.getLog(MergeTask.class.getName()); - boolean isSilent = HiveConf.getBoolVar(conf, - HiveConf.ConfVars.HIVESESSIONSILENT); - LogHelper console = new LogHelper(LOG, isSilent); - - // print out the location of the log file for the user so - // that it's easy to find reason for local mode execution failures - for (Appender appender : Collections - .list((Enumeration) LogManager.getRootLogger() - .getAllAppenders())) { - if (appender instanceof FileAppender) { - console.printInfo("Execution log at: " - + ((FileAppender) appender).getFile()); - } - } - - MergeWork mergeWork = null; - if (format.equals("rcfile")) { - mergeWork = new MergeWork(inputPaths, new Path(outputDir), RCFileInputFormat.class); - } else if (format.equals("orcfile")) { - mergeWork = new MergeWork(inputPaths, new Path(outputDir), OrcInputFormat.class); - } - - DriverContext driverCxt = new DriverContext(); - MergeTask taskExec = new MergeTask(); - taskExec.initialize(hiveConf, null, driverCxt); - taskExec.setWork(mergeWork); - int ret = taskExec.execute(driverCxt); - - if (ret != 0) { - System.exit(2); - } - - } - - private static void printUsage() { - System.err.println("MergeTask -format -input " - + "-outputDir outputDir [-jobconffile ] "); - System.exit(1); - } - - @Override - public StageType getType() { - return StageType.MAPRED; - } - - @Override - public boolean checkFatalErrors(Counters ctrs, StringBuilder errMsg) { - return false; - } - - @Override - public void logPlanProgress(SessionState ss) throws IOException { - // no op - } -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeWork.java ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeWork.java deleted file mode 100644 index 9efee3c..0000000 --- ql/src/java/org/apache/hadoop/hive/ql/io/merge/MergeWork.java +++ /dev/null @@ -1,261 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hadoop.hive.ql.io.merge; - -import java.io.IOException; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; - -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.common.HiveStatsUtils; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; -import org.apache.hadoop.hive.ql.io.RCFileInputFormat; -import org.apache.hadoop.hive.ql.io.orc.OrcFileMergeMapper; -import org.apache.hadoop.hive.ql.io.orc.OrcFileStripeMergeInputFormat; -import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; -import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileBlockMergeInputFormat; -import org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileMergeMapper; -import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx; -import org.apache.hadoop.hive.ql.plan.Explain; -import org.apache.hadoop.hive.ql.plan.ListBucketingCtx; -import org.apache.hadoop.hive.ql.plan.MapWork; -import org.apache.hadoop.hive.ql.plan.PartitionDesc; -import org.apache.hadoop.hive.ql.plan.TableDesc; -import org.apache.hadoop.mapred.InputFormat; -import org.apache.hadoop.mapred.Mapper; - -@Explain(displayName = "Merge Work") -public class MergeWork extends MapWork implements Serializable { - - private static final long serialVersionUID = 1L; - - private transient List inputPaths; - private transient Path outputDir; - private boolean hasDynamicPartitions; - private DynamicPartitionCtx dynPartCtx; - private boolean isListBucketingAlterTableConcatenate; - private ListBucketingCtx listBucketingCtx; - private Class srcTblInputFormat; - - public MergeWork() { - } - - public MergeWork(List inputPaths, Path outputDir, - Class srcTblInputFormat) { - this(inputPaths, outputDir, false, null, srcTblInputFormat); - } - - public MergeWork(List inputPaths, Path outputDir, - boolean hasDynamicPartitions, DynamicPartitionCtx dynPartCtx, - Class srcTblInputFormat) { - super(); - this.inputPaths = inputPaths; - this.outputDir = outputDir; - this.hasDynamicPartitions = hasDynamicPartitions; - this.dynPartCtx = dynPartCtx; - this.srcTblInputFormat = srcTblInputFormat; - PartitionDesc partDesc = new PartitionDesc(); - if(srcTblInputFormat.equals(OrcInputFormat.class)) { - partDesc.setInputFileFormatClass(OrcFileStripeMergeInputFormat.class); - } else if(srcTblInputFormat.equals(RCFileInputFormat.class)) { - partDesc.setInputFileFormatClass(RCFileBlockMergeInputFormat.class); - } - if(this.getPathToPartitionInfo() == null) { - this.setPathToPartitionInfo(new LinkedHashMap()); - } - for(Path path: this.inputPaths) { - this.getPathToPartitionInfo().put(path.toString(), partDesc); - } - } - - public List getInputPaths() { - return inputPaths; - } - - public void setInputPaths(List inputPaths) { - this.inputPaths = inputPaths; - } - - public Path getOutputDir() { - return outputDir; - } - - public void setOutputDir(Path outputDir) { - this.outputDir = outputDir; - } - - public Class getMapperClass(Class klass) { - if (klass.equals(RCFileInputFormat.class)) { - return RCFileMergeMapper.class; - } else if (klass.equals(OrcInputFormat.class)) { - return OrcFileMergeMapper.class; - } - return null; - } - - @Override - public Long getMinSplitSize() { - return null; - } - - @Override - public String getInputformat() { - return CombineHiveInputFormat.class.getName(); - } - - @Override - public boolean isGatheringStats() { - return false; - } - - public 
boolean hasDynamicPartitions() { - return this.hasDynamicPartitions; - } - - public void setHasDynamicPartitions(boolean hasDynamicPartitions) { - this.hasDynamicPartitions = hasDynamicPartitions; - } - - @Override - public void resolveDynamicPartitionStoredAsSubDirsMerge(HiveConf conf, Path path, - TableDesc tblDesc, ArrayList aliases, PartitionDesc partDesc) { - - String inputFormatClass = null; - if (tblDesc.getInputFileFormatClass().equals(RCFileInputFormat.class)) { - inputFormatClass = conf.getVar(HiveConf.ConfVars.HIVEMERGEINPUTFORMATBLOCKLEVEL); - } else if (tblDesc.getInputFileFormatClass().equals(OrcInputFormat.class)){ - inputFormatClass = conf.getVar(HiveConf.ConfVars.HIVEMERGEINPUTFORMATSTRIPELEVEL); - } - - try { - partDesc.setInputFileFormatClass((Class ) - Class.forName(inputFormatClass)); - } catch (ClassNotFoundException e) { - String msg = "Merge input format class not found"; - throw new RuntimeException(msg); - } - super.resolveDynamicPartitionStoredAsSubDirsMerge(conf, path, tblDesc, aliases, partDesc); - - // Add the DP path to the list of input paths - inputPaths.add(path); - } - - /** - * alter table ... concatenate - * - * If it is skewed table, use subdirectories in inputpaths. - */ - public void resolveConcatenateMerge(HiveConf conf) { - isListBucketingAlterTableConcatenate = ((listBucketingCtx == null) ? false : listBucketingCtx - .isSkewedStoredAsDir()); - if (isListBucketingAlterTableConcatenate) { - // use sub-dir as inputpath. - assert ((this.inputPaths != null) && (this.inputPaths.size() == 1)) : - "alter table ... concatenate should only have one directory inside inputpaths"; - Path dirPath = inputPaths.get(0); - try { - FileSystem inpFs = dirPath.getFileSystem(conf); - FileStatus[] status = HiveStatsUtils.getFileStatusRecurse(dirPath, listBucketingCtx - .getSkewedColNames().size(), inpFs); - List newInputPath = new ArrayList(); - boolean succeed = true; - for (int i = 0; i < status.length; ++i) { - if (status[i].isDir()) { - // Add the lb path to the list of input paths - newInputPath.add(status[i].getPath()); - } else { - // find file instead of dir. 
dont change inputpath - succeed = false; - } - } - assert (succeed || ((!succeed) && newInputPath.isEmpty())) : "This partition has " - + " inconsistent file structure: " - + "it is stored-as-subdir and expected all files in the same depth of subdirectories."; - if (succeed) { - inputPaths.clear(); - inputPaths.addAll(newInputPath); - } - } catch (IOException e) { - String msg = "Fail to get filesystem for directory name : " + dirPath.toUri(); - throw new RuntimeException(msg, e); - } - - } - } - - public DynamicPartitionCtx getDynPartCtx() { - return dynPartCtx; - } - - public void setDynPartCtx(DynamicPartitionCtx dynPartCtx) { - this.dynPartCtx = dynPartCtx; - } - - /** - * @return the listBucketingCtx - */ - public ListBucketingCtx getListBucketingCtx() { - return listBucketingCtx; - } - - /** - * @param listBucketingCtx the listBucketingCtx to set - */ - public void setListBucketingCtx(ListBucketingCtx listBucketingCtx) { - this.listBucketingCtx = listBucketingCtx; - } - - /** - * @return the isListBucketingAlterTableConcatenate - */ - public boolean isListBucketingAlterTableConcatenate() { - return isListBucketingAlterTableConcatenate; - } - - public Class getSourceTableInputFormat() { - return srcTblInputFormat; - } - - @Explain(displayName = "input format") - public String getStringifiedInputFormat() { - return srcTblInputFormat.getCanonicalName(); - } - - @Explain(displayName = "merge level") - public String getMergeLevel() { - if (srcTblInputFormat != null) { - if (srcTblInputFormat.equals(OrcInputFormat.class)) { - return "stripe"; - } else if (srcTblInputFormat.equals(RCFileInputFormat.class)) { - return "block"; - } - } - return null; - } - - public void setSourceTableInputFormat(Class srcTblInputFormat) { - this.srcTblInputFormat = srcTblInputFormat; - } - -} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileMergeMapper.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileMergeMapper.java index b36152a..3adfd5f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileMergeMapper.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFileMergeMapper.java @@ -18,24 +18,24 @@ package org.apache.hadoop.hive.ql.io.orc; -import java.io.IOException; -import java.util.List; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.merge.MergeMapper; +import org.apache.hadoop.hive.ql.io.merge.MergeFileMapper; import org.apache.hadoop.hive.shims.CombineHiveKey; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; +import java.io.IOException; +import java.util.List; + /** * Map task fast merging of ORC files. 
*/ -public class OrcFileMergeMapper extends MergeMapper implements +public class OrcFileMergeMapper extends MergeFileMapper implements Mapper { // These parameters must match for all orc files involved in merging @@ -94,6 +94,7 @@ public void map(Object key, OrcFileValueWrapper value, OutputCollector getRecordReader( diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java index c391b0e..6411e3f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.util.List; /** * The interface for writing ORC files. @@ -72,4 +73,30 @@ * @return the offset that would be a valid end location for an ORC file */ long writeIntermediateFooter() throws IOException; + + /** + * Fast stripe append to ORC file. This interface is used for fast ORC file + * merge with other ORC files. When merging, the file to be merged should pass + * stripe in binary form along with stripe information and stripe statistics. + * After appending last stripe of a file, use appendUserMetadata() to append + * any user metadata. + * @param stripe - stripe as byte array + * @param offset - offset within byte array + * @param length - length of stripe within byte array + * @param stripeInfo - stripe information + * @param stripeStatistics - stripe statistics (Protobuf objects can be + * merged directly) + * @throws IOException + */ + public void appendStripe(byte[] stripe, int offset, int length, + StripeInformation stripeInfo, + OrcProto.StripeStatistics stripeStatistics) throws IOException; + + /** + * When fast stripe append is used for merging ORC stripes, after appending + * the last stripe from a file, this interface must be used to merge any + * user metadata. 
+ * @param userMetadata - user metadata + */ + public void appendUserMetadata(List userMetadata); } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java index 76b4d03..18c2bdb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java @@ -18,16 +18,10 @@ package org.apache.hadoop.hive.ql.io.orc; -import java.io.IOException; -import java.io.OutputStream; -import java.lang.management.ManagementFactory; -import java.nio.ByteBuffer; -import java.sql.Timestamp; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; +import com.google.protobuf.ByteString; +import com.google.protobuf.CodedOutputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -71,10 +65,17 @@ import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.Lists; -import com.google.protobuf.ByteString; -import com.google.protobuf.CodedOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.lang.management.ManagementFactory; +import java.nio.ByteBuffer; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import static com.google.common.base.Preconditions.checkArgument; /** * An ORC file writer. The file is divided into stripes, which is the natural @@ -2218,17 +2219,19 @@ public synchronized long writeIntermediateFooter() throws IOException { return rawWriter.getPos(); } - void appendStripe(byte[] stripe, StripeInformation stripeInfo, - OrcProto.StripeStatistics stripeStatistics) throws IOException { - appendStripe(stripe, 0, stripe.length, stripeInfo, stripeStatistics); - } - - void appendStripe(byte[] stripe, int offset, int length, + @Override + public void appendStripe(byte[] stripe, int offset, int length, StripeInformation stripeInfo, OrcProto.StripeStatistics stripeStatistics) throws IOException { + checkArgument(stripe != null, "Stripe must not be null"); + checkArgument(length <= stripe.length, + "Specified length must not be greater specified array length"); + checkArgument(stripeInfo != null, "Stripe information must not be null"); + checkArgument(stripeStatistics != null, + "Stripe statistics must not be null"); + getStream(); long start = rawWriter.getPos(); - long stripeLen = length; long availBlockSpace = blockSize - (start % blockSize); @@ -2284,7 +2287,8 @@ private void updateFileStatistics(OrcProto.StripeStatistics stripeStatistics) { } } - void appendUserMetadata(List userMetadata) { + @Override + public void appendUserMetadata(List userMetadata) { if (userMetadata != null) { for (UserMetadataItem item : userMetadata) { this.userMetadata.put(item.getName(), item.getValue()); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeInputFormat.java index 6809c79..4df80ea 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileBlockMergeInputFormat.java @@ -20,14 +20,14 @@ import java.io.IOException; 
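/*
 * Illustration only (not part of the patch): a minimal sketch of how the new
 * Writer.appendStripe()/appendUserMetadata() interface added above is meant to
 * be driven during a fast ORC merge. The class and helper names, the way the
 * stripe list, stripe statistics and user metadata are obtained, and the
 * List<OrcProto.UserMetadataItem> element type (the generics were elided in the
 * patch text) are assumptions; only the two Writer calls come from the
 * interface additions above. The real driver in this patch is OrcFileMergeMapper.
 */
package org.apache.hadoop.hive.ql.io.orc;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class FastStripeAppendSketch {
  /** Copies every stripe of src into out without decoding any rows. */
  static void fastAppendStripes(FileSystem fs, Path src, Writer out,
      List<StripeInformation> stripes,
      List<OrcProto.StripeStatistics> stripeStats,
      List<OrcProto.UserMetadataItem> userMetadata) throws IOException {
    FSDataInputStream in = fs.open(src);
    try {
      for (int i = 0; i < stripes.size(); i++) {
        StripeInformation si = stripes.get(i);
        // read the stripe as raw bytes and hand it to the writer together
        // with its stripe information and protobuf stripe statistics
        byte[] buf = new byte[(int) si.getLength()]; // sketch assumes stripes fit in an int-sized buffer
        in.readFully(si.getOffset(), buf, 0, buf.length);
        out.appendStripe(buf, 0, buf.length, si, stripeStats.get(i));
      }
    } finally {
      in.close();
    }
    // per the interface javadoc above: merge user metadata after the last stripe
    out.appendUserMetadata(userMetadata);
  }
}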
-import org.apache.hadoop.hive.ql.io.merge.MergeInputFormat; +import org.apache.hadoop.hive.ql.io.merge.MergeFileInputFormat; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; -public class RCFileBlockMergeInputFormat extends MergeInputFormat { +public class RCFileBlockMergeInputFormat extends MergeFileInputFormat { @Override public RecordReader diff --git ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileMergeMapper.java ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileMergeMapper.java index dee6b1c..cab141f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileMergeMapper.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/rcfile/merge/RCFileMergeMapper.java @@ -24,7 +24,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.ql.io.RCFile; import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; -import org.apache.hadoop.hive.ql.io.merge.MergeMapper; +import org.apache.hadoop.hive.ql.io.merge.MergeFileMapper; import org.apache.hadoop.hive.shims.CombineHiveKey; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.mapred.Mapper; @@ -32,7 +32,7 @@ import org.apache.hadoop.mapred.Reporter; @SuppressWarnings("deprecation") -public class RCFileMergeMapper extends MergeMapper implements +public class RCFileMergeMapper extends MergeFileMapper implements Mapper { RCFile.Writer outWriter; diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java index 7129ed8..63502bb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GenMapRedUtils.java @@ -65,7 +65,7 @@ import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.hooks.ReadEntity; import org.apache.hadoop.hive.ql.io.RCFileInputFormat; -import org.apache.hadoop.hive.ql.io.merge.MergeWork; +import org.apache.hadoop.hive.ql.io.merge.MergeFileWork; import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.Partition; @@ -1250,7 +1250,7 @@ public static void createMRWorkForMergingFiles (FileSinkOperator fsInput, (conf.getBoolVar(ConfVars.HIVEMERGEORCFILESTRIPELEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class))) { - // Check if InputFormatClass is valid + // Compile time check to see if InputFormatClass is valid final String inputFormatClass; if (fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) { inputFormatClass = conf.getVar(ConfVars.HIVEMERGEINPUTFORMATBLOCKLEVEL); @@ -1259,24 +1259,24 @@ public static void createMRWorkForMergingFiles (FileSinkOperator fsInput, } try { Class c = Class.forName(inputFormatClass); - - if(fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class)) { - LOG.info("OrcFile format - Using stripe level merge"); - } else { - LOG.info("RCFile format- Using block level merge"); - } - cplan = GenMapRedUtils.createMergeTask(fsInputDesc, finalName, - dpCtx != null && dpCtx.getNumDPCols() > 0); - work = cplan; } catch (ClassNotFoundException e) { String msg = "Illegal input format class: " + inputFormatClass; throw new SemanticException(msg); } + cplan = GenMapRedUtils.createMergeTask(fsInputDesc, 
finalName, + dpCtx != null && dpCtx.getNumDPCols() > 0); + if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) { + work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID)); + cplan.setName("MergeFileWork"); + ((TezWork) work).add(cplan); + } else { + work = cplan; + } } else { cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc); if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) { work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID)); - cplan.setName("Merge"); + cplan.setName("MergeFileWork"); ((TezWork)work).add(cplan); } else { work = new MapredWork(); @@ -1508,9 +1508,9 @@ public static MapWork createMergeTask(FileSinkDesc fsInputDesc, inputDirstr.add(inputDir.toString()); } - MergeWork work = new MergeWork(inputDirs, finalName, + MergeFileWork work = new MergeFileWork(inputDirs, finalName, hasDynamicPartitions, fsInputDesc.getDynPartCtx(), - tblDesc.getInputFileFormatClass()); + tblDesc.getInputFileFormatClass().getName()); LinkedHashMap> pathToAliases = new LinkedHashMap>(); pathToAliases.put(inputDir.toString(), (ArrayList) inputDirstr.clone()); diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezProcContext.java ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezProcContext.java index 8513f99..6255489 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezProcContext.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezProcContext.java @@ -26,29 +26,26 @@ import java.util.Map; import java.util.Set; -import org.apache.commons.lang3.tuple.ImmutablePair; -import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.DependencyCollectionTask; import org.apache.hadoop.hive.ql.exec.FileSinkOperator; import org.apache.hadoop.hive.ql.exec.MapJoinOperator; import org.apache.hadoop.hive.ql.exec.Operator; -import org.apache.hadoop.hive.ql.exec.UnionOperator; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.Task; import org.apache.hadoop.hive.ql.exec.TaskFactory; +import org.apache.hadoop.hive.ql.exec.UnionOperator; import org.apache.hadoop.hive.ql.exec.tez.TezTask; import org.apache.hadoop.hive.ql.hooks.ReadEntity; import org.apache.hadoop.hive.ql.hooks.WriteEntity; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.plan.BaseWork; import org.apache.hadoop.hive.ql.plan.DependencyCollectionWork; +import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.MoveWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.TezEdgeProperty; -import org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType; import org.apache.hadoop.hive.ql.plan.TezWork; /** diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java index d58c59d..cc42c5a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java @@ -223,6 +223,9 @@ protected void setInputFormat(Task task) { } private void setInputFormat(MapWork work, Operator op) { + if (op == null) { + return; + } if (op.isUseBucketizedHiveInputFormat()) { work.setUseBucketizedHiveInputFormat(true); return; diff --git ql/src/test/queries/clientpositive/orc_merge5.q ql/src/test/queries/clientpositive/orc_merge5.q new file mode 100644 index 0000000..ff514b9 
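/*
 * Illustration only (not part of the patch): the GenMapRedUtils.createMRWorkForMergingFiles
 * hunk above boils down to this wiring — when hive.execution.engine is "tez",
 * the merge MapWork produced by createMergeTask() becomes a "MergeFileWork"
 * vertex inside a TezWork; otherwise it keeps running as a plain map-only work.
 * The class name, helper name and its parameters are assumptions for the sketch;
 * the config lookups, TezWork construction and vertex naming mirror the diff.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.io.Serializable;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.TezWork;

class MergeWorkWiringSketch {
  static Serializable wrapForExecutionEngine(HiveConf conf, MapWork cplan) {
    if (conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
      TezWork tezWork = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
      cplan.setName("MergeFileWork"); // the vertex name that shows up in the explain plans below
      tezWork.add(cplan);
      return tezWork;
    }
    return cplan; // MapReduce path: behaviour unchanged
  }
}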
--- /dev/null +++ ql/src/test/queries/clientpositive/orc_merge5.q @@ -0,0 +1,61 @@ +-- SORT_QUERY_RESULTS + +create table orc_merge5 (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc; +create table orc_merge5b (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc; + +load data local inpath '../../data/files/orc_split_elim.orc' into table orc_merge5; + +SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; +SET mapred.min.split.size=1000; +SET mapred.max.split.size=50000; +SET hive.optimize.index.filter=true; +set hive.merge.orcfile.stripe.level=false; +set hive.merge.tezfiles=false; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set hive.compute.splits.in.am=true; +set tez.am.grouping.min-size=1000; +set tez.am.grouping.max-size=50000; + +-- 3 mappers +explain insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; +insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; + +-- 3 files total +analyze table orc_merge5b compute statistics noscan; +desc formatted orc_merge5b; +select * from orc_merge5b; + +set hive.merge.orcfile.stripe.level=true; +set hive.merge.tezfiles=true; +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; + +-- 3 mappers +explain insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; +insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; + +-- 1 file after merging +analyze table orc_merge5b compute statistics noscan; +desc formatted orc_merge5b; +select * from orc_merge5b; + +set hive.merge.orcfile.stripe.level=false; +set hive.merge.tezfiles=false; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; +analyze table orc_merge5b compute statistics noscan; +desc formatted orc_merge5b; +select * from orc_merge5b; + +set hive.merge.orcfile.stripe.level=true; +explain alter table orc_merge5b concatenate; +alter table orc_merge5b concatenate; + +-- 1 file after merging +analyze table orc_merge5b compute statistics noscan; +desc formatted orc_merge5b; +select * from orc_merge5b; + diff --git ql/src/test/queries/clientpositive/orc_merge6.q ql/src/test/queries/clientpositive/orc_merge6.q new file mode 100644 index 0000000..76d5ca2 --- /dev/null +++ ql/src/test/queries/clientpositive/orc_merge6.q @@ -0,0 +1,78 @@ +-- SORT_QUERY_RESULTS + +-- orc file merge tests for static partitions +create table orc_merge5 (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc; +create table orc_merge5a (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) partitioned by (year string, hour int) stored as orc; + +load data local inpath '../../data/files/orc_split_elim.orc' into table orc_merge5; + +SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; +SET mapred.min.split.size=1000; +SET mapred.max.split.size=50000; +SET hive.optimize.index.filter=true; +set hive.merge.orcfile.stripe.level=false; +set hive.merge.tezfiles=false; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set hive.compute.splits.in.am=true; +set tez.am.grouping.min-size=1000; +set tez.am.grouping.max-size=50000; + +-- 3 mappers +explain insert overwrite 
table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; +insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; +insert overwrite table orc_merge5a partition (year="2001",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; + +-- 3 files total +analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan; +analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan; +desc formatted orc_merge5a partition(year="2000",hour=24); +desc formatted orc_merge5a partition(year="2001",hour=24); +show partitions orc_merge5a; +select * from orc_merge5a; + +set hive.merge.orcfile.stripe.level=true; +set hive.merge.tezfiles=true; +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; + +-- 3 mappers +explain insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; +insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; +insert overwrite table orc_merge5a partition (year="2001",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; + +-- 1 file after merging +analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan; +analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan; +desc formatted orc_merge5a partition(year="2000",hour=24); +desc formatted orc_merge5a partition(year="2001",hour=24); +show partitions orc_merge5a; +select * from orc_merge5a; + +set hive.merge.orcfile.stripe.level=false; +set hive.merge.tezfiles=false; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; +insert overwrite table orc_merge5a partition (year="2001",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13; +analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan; +analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan; +desc formatted orc_merge5a partition(year="2000",hour=24); +desc formatted orc_merge5a partition(year="2001",hour=24); +show partitions orc_merge5a; +select * from orc_merge5a; + +set hive.merge.orcfile.stripe.level=true; +explain alter table orc_merge5a partition(year="2000",hour=24) concatenate; +alter table orc_merge5a partition(year="2000",hour=24) concatenate; +alter table orc_merge5a partition(year="2001",hour=24) concatenate; + +-- 1 file after merging +analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan; +analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan; +desc formatted orc_merge5a partition(year="2000",hour=24); +desc formatted orc_merge5a partition(year="2001",hour=24); +show partitions orc_merge5a; +select * from orc_merge5a; + diff --git ql/src/test/queries/clientpositive/orc_merge7.q ql/src/test/queries/clientpositive/orc_merge7.q new file mode 100644 index 0000000..2a7fc69 --- /dev/null +++ ql/src/test/queries/clientpositive/orc_merge7.q @@ -0,0 +1,82 @@ +-- SORT_QUERY_RESULTS + +-- orc merge file tests for dynamic partition case + +create table orc_merge5 (userid bigint, string1 string, subtype 
double, decimal1 decimal, ts timestamp) stored as orc; +create table orc_merge5a (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) partitioned by (st double) stored as orc; + +load data local inpath '../../data/files/orc_split_elim.orc' into table orc_merge5; + +SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; +SET mapred.min.split.size=1000; +SET mapred.max.split.size=50000; +SET hive.optimize.index.filter=true; +set hive.merge.orcfile.stripe.level=false; +set hive.merge.tezfiles=false; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; +set hive.compute.splits.in.am=true; +set tez.am.grouping.min-size=1000; +set tez.am.grouping.max-size=50000; +set hive.exec.dynamic.partition=true; +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.optimize.sort.dynamic.partition=false; + +-- 3 mappers +explain insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5; +insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5; +insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5; + +-- 3 files total +analyze table orc_merge5a partition(st=80.0) compute statistics noscan; +analyze table orc_merge5a partition(st=0.8) compute statistics noscan; +desc formatted orc_merge5a partition(st=80.0); +desc formatted orc_merge5a partition(st=0.8); +show partitions orc_merge5a; +select * from orc_merge5a where userid<=13; + +set hive.merge.orcfile.stripe.level=true; +set hive.merge.tezfiles=true; +set hive.merge.mapfiles=true; +set hive.merge.mapredfiles=true; + +-- 3 mappers +explain insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5; +insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5; +insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5; + +-- 1 file after merging +analyze table orc_merge5a partition(st=80.0) compute statistics noscan; +analyze table orc_merge5a partition(st=0.8) compute statistics noscan; +desc formatted orc_merge5a partition(st=80.0); +desc formatted orc_merge5a partition(st=0.8); +show partitions orc_merge5a; +select * from orc_merge5a where userid<=13; + +set hive.merge.orcfile.stripe.level=false; +set hive.merge.tezfiles=false; +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5; +insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5; +analyze table orc_merge5a partition(st=80.0) compute statistics noscan; +analyze table orc_merge5a partition(st=0.8) compute statistics noscan; +desc formatted orc_merge5a partition(st=80.0); +desc formatted orc_merge5a partition(st=0.8); +show partitions orc_merge5a; +select * from orc_merge5a where userid<=13; + +set hive.merge.orcfile.stripe.level=true; +explain alter table orc_merge5a partition(st=80.0) concatenate; +alter table orc_merge5a partition(st=80.0) concatenate; +alter table orc_merge5a partition(st=0.8) concatenate; + +-- 1 file after merging +analyze table orc_merge5a partition(st=80.0) compute statistics noscan; +analyze table orc_merge5a partition(st=0.8) compute statistics noscan; +desc formatted orc_merge5a 
partition(st=80.0); +desc formatted orc_merge5a partition(st=0.8); +show partitions orc_merge5a; +select * from orc_merge5a where userid<=13; + diff --git ql/src/test/results/clientpositive/tez/orc_merge5.q.out ql/src/test/results/clientpositive/tez/orc_merge5.q.out new file mode 100644 index 0000000..935f158 --- /dev/null +++ ql/src/test/results/clientpositive/tez/orc_merge5.q.out @@ -0,0 +1,476 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS + +create table orc_merge5 (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: -- SORT_QUERY_RESULTS + +create table orc_merge5 (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_merge5 +PREHOOK: query: create table orc_merge5b (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: create table orc_merge5b (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_merge5b +PREHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_merge5 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@orc_merge5 +POSTHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_merge5 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@orc_merge5 +PREHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +POSTHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: orc_merge5 + filterExpr: (userid <= 13) (type: boolean) + Statistics: Num rows: 919 Data size: 246402 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (userid <= 13) (type: boolean) + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: userid (type: bigint), string1 (type: string), subtype (type: double), decimal1 (type: decimal(10,0)), ts (type: timestamp) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5b + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: 
org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5b + + Stage: Stage-3 + Stats-Aggr Operator + +PREHOOK: query: insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5b +POSTHOOK: query: insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5b +POSTHOOK: Lineage: orc_merge5b.decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5b.string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5b.subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5b.ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5b.userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: -- 3 files total +analyze table orc_merge5b compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5b +POSTHOOK: query: -- 3 files total +analyze table orc_merge5b compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5b +PREHOOK: query: desc formatted orc_merge5b +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5b +POSTHOOK: query: desc formatted orc_merge5b +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5b +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE true + numFiles 3 + numRows 3 + rawDataSize 765 + totalSize 1088 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from orc_merge5b +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5b +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_merge5b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5b +#### A masked pattern was here #### +13 bar 80.0 2 1969-12-31 16:00:05 +2 foo 0.8 1 1969-12-31 16:00:00 +5 eat 0.8 6 1969-12-31 16:00:20 +PREHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +POSTHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-8 depends on stages: Stage-1 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-2 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + +STAGE PLANS: + 
Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: orc_merge5 + filterExpr: (userid <= 13) (type: boolean) + Statistics: Num rows: 919 Data size: 246402 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (userid <= 13) (type: boolean) + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: userid (type: bigint), string1 (type: string), subtype (type: double), decimal1 (type: decimal(10,0)), ts (type: timestamp) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5b + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5b + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-4 + Tez +#### A masked pattern was here #### + Vertices: + MergeFileWork + Merge File Work + merge level: stripe + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + + Stage: Stage-6 + Tez +#### A masked pattern was here #### + Vertices: + MergeFileWork + Merge File Work + merge level: stripe + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + +PREHOOK: query: insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5b +POSTHOOK: query: insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5b +POSTHOOK: Lineage: orc_merge5b.decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5b.string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5b.subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5b.ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5b.userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: -- 1 file after merging +analyze table orc_merge5b compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5b +POSTHOOK: query: -- 1 file after merging +analyze table orc_merge5b compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5b +PREHOOK: query: desc formatted orc_merge5b +PREHOOK: type: DESCTABLE +PREHOOK: Input: 
default@orc_merge5b +POSTHOOK: query: desc formatted orc_merge5b +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5b +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE true + numFiles 1 + numRows 3 + rawDataSize 765 + totalSize 863 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from orc_merge5b +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5b +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_merge5b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5b +#### A masked pattern was here #### +13 bar 80.0 2 1969-12-31 16:00:05 +2 foo 0.8 1 1969-12-31 16:00:00 +5 eat 0.8 6 1969-12-31 16:00:20 +PREHOOK: query: insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5b +POSTHOOK: query: insert overwrite table orc_merge5b select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5b +POSTHOOK: Lineage: orc_merge5b.decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5b.string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5b.subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5b.ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5b.userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: analyze table orc_merge5b compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5b +POSTHOOK: query: analyze table orc_merge5b compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5b +PREHOOK: query: desc formatted orc_merge5b +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5b +POSTHOOK: query: desc formatted orc_merge5b +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5b +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE true + numFiles 3 + numRows 3 + rawDataSize 765 + totalSize 1088 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket 
Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from orc_merge5b +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5b +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_merge5b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5b +#### A masked pattern was here #### +13 bar 80.0 2 1969-12-31 16:00:05 +2 foo 0.8 1 1969-12-31 16:00:00 +5 eat 0.8 6 1969-12-31 16:00:20 +PREHOOK: query: explain alter table orc_merge5b concatenate +PREHOOK: type: ALTER_TABLE_MERGE +POSTHOOK: query: explain alter table orc_merge5b concatenate +POSTHOOK: type: ALTER_TABLE_MERGE +STAGE DEPENDENCIES: + Stage-0 is a root stage + Stage-1 depends on stages: Stage-0 + Stage-2 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-0 + + Stage: Stage-1 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5b + + Stage: Stage-2 + Stats-Aggr Operator + +PREHOOK: query: alter table orc_merge5b concatenate +PREHOOK: type: ALTER_TABLE_MERGE +PREHOOK: Input: default@orc_merge5b +PREHOOK: Output: default@orc_merge5b +POSTHOOK: query: alter table orc_merge5b concatenate +POSTHOOK: type: ALTER_TABLE_MERGE +POSTHOOK: Input: default@orc_merge5b +POSTHOOK: Output: default@orc_merge5b +PREHOOK: query: -- 1 file after merging +analyze table orc_merge5b compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5b +POSTHOOK: query: -- 1 file after merging +analyze table orc_merge5b compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5b +PREHOOK: query: desc formatted orc_merge5b +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5b +POSTHOOK: query: desc formatted orc_merge5b +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5b +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Protect Mode: None +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE true + numFiles 1 + numRows 3 + rawDataSize 765 + totalSize 863 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: select * from orc_merge5b +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5b +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_merge5b +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5b +#### A masked pattern was here #### +13 bar 80.0 2 1969-12-31 16:00:05 +2 foo 0.8 1 1969-12-31 16:00:00 +5 eat 0.8 6 1969-12-31 16:00:20 diff --git ql/src/test/results/clientpositive/tez/orc_merge6.q.out ql/src/test/results/clientpositive/tez/orc_merge6.q.out new file mode 100644 index 0000000..1f3239a --- /dev/null +++ ql/src/test/results/clientpositive/tez/orc_merge6.q.out @@ -0,0 +1,838 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS + +-- orc file merge tests for static partitions +create table orc_merge5 (userid bigint, string1 string, subtype 
double, decimal1 decimal, ts timestamp) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: -- SORT_QUERY_RESULTS + +-- orc file merge tests for static partitions +create table orc_merge5 (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_merge5 +PREHOOK: query: create table orc_merge5a (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) partitioned by (year string, hour int) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: create table orc_merge5a (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) partitioned by (year string, hour int) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_merge5a +PREHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_merge5 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@orc_merge5 +POSTHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_merge5 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@orc_merge5 +PREHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +POSTHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: orc_merge5 + filterExpr: (userid <= 13) (type: boolean) + Statistics: Num rows: 919 Data size: 246402 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (userid <= 13) (type: boolean) + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: userid (type: bigint), string1 (type: string), subtype (type: double), decimal1 (type: decimal(10,0)), ts (type: timestamp) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5a + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + partition: + hour 24 + year 2000 + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5a + + Stage: Stage-3 + Stats-Aggr Operator + +PREHOOK: query: insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: 
QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: query: insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: insert overwrite table orc_merge5a partition (year="2001",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5a@year=2001/hour=24 +POSTHOOK: query: insert overwrite table orc_merge5a partition (year="2001",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5a@year=2001/hour=24 +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: -- 3 files total +analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: query: -- 3 files total +analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@year=2000/hour=24 +PREHOOK: query: analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@year=2001/hour=24 +POSTHOOK: query: analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@year=2001/hour=24 +PREHOOK: query: desc formatted orc_merge5a partition(year="2000",hour=24) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: desc 
formatted orc_merge5a partition(year="2000",hour=24) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +year string +hour int + +# Detailed Partition Information +Partition Value: [2000, 24] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 3 + numRows 3 + rawDataSize 765 + totalSize 1088 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted orc_merge5a partition(year="2001",hour=24) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: desc formatted orc_merge5a partition(year="2001",hour=24) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +year string +hour int + +# Detailed Partition Information +Partition Value: [2001, 24] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 3 + numRows 3 + rawDataSize 765 + totalSize 1088 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions orc_merge5a +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: show partitions orc_merge5a +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Input: default@orc_merge5a +year=2000/hour=24 +year=2001/hour=24 +PREHOOK: query: select * from orc_merge5a +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5a +PREHOOK: Input: default@orc_merge5a@year=2000/hour=24 +PREHOOK: Input: default@orc_merge5a@year=2001/hour=24 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_merge5a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5a +POSTHOOK: Input: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: Input: default@orc_merge5a@year=2001/hour=24 +#### A masked pattern was here #### +13 bar 80.0 2 1969-12-31 16:00:05 2000 24 +13 bar 80.0 2 1969-12-31 16:00:05 2001 24 +2 foo 0.8 1 1969-12-31 16:00:00 2000 24 +2 foo 0.8 1 1969-12-31 16:00:00 2001 24 +5 eat 0.8 6 1969-12-31 16:00:20 2000 24 +5 eat 0.8 6 1969-12-31 16:00:20 2001 24 +PREHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +POSTHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 
+POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-8 depends on stages: Stage-1 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-2 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: orc_merge5 + filterExpr: (userid <= 13) (type: boolean) + Statistics: Num rows: 919 Data size: 246402 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (userid <= 13) (type: boolean) + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: userid (type: bigint), string1 (type: string), subtype (type: double), decimal1 (type: decimal(10,0)), ts (type: timestamp) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 306 Data size: 82044 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5a + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + partition: + hour 24 + year 2000 + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5a + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-4 + Tez +#### A masked pattern was here #### + Vertices: + MergeFileWork + Merge File Work + merge level: stripe + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + + Stage: Stage-6 + Tez +#### A masked pattern was here #### + Vertices: + MergeFileWork + Merge File Work + merge level: stripe + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + +PREHOOK: query: insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: query: insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).ts SIMPLE 
[(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: insert overwrite table orc_merge5a partition (year="2001",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5a@year=2001/hour=24 +POSTHOOK: query: insert overwrite table orc_merge5a partition (year="2001",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5a@year=2001/hour=24 +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: -- 1 file after merging +analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: query: -- 1 file after merging +analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@year=2000/hour=24 +PREHOOK: query: analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@year=2001/hour=24 +POSTHOOK: query: analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@year=2001/hour=24 +PREHOOK: query: desc formatted orc_merge5a partition(year="2000",hour=24) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: desc formatted orc_merge5a partition(year="2000",hour=24) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +year string +hour int + +# Detailed Partition Information +Partition Value: [2000, 24] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 1 + numRows 3 + rawDataSize 765 + totalSize 863 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 
+Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted orc_merge5a partition(year="2001",hour=24) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: desc formatted orc_merge5a partition(year="2001",hour=24) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +year string +hour int + +# Detailed Partition Information +Partition Value: [2001, 24] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 1 + numRows 3 + rawDataSize 765 + totalSize 863 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions orc_merge5a +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: show partitions orc_merge5a +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Input: default@orc_merge5a +year=2000/hour=24 +year=2001/hour=24 +PREHOOK: query: select * from orc_merge5a +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5a +PREHOOK: Input: default@orc_merge5a@year=2000/hour=24 +PREHOOK: Input: default@orc_merge5a@year=2001/hour=24 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_merge5a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5a +POSTHOOK: Input: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: Input: default@orc_merge5a@year=2001/hour=24 +#### A masked pattern was here #### +13 bar 80.0 2 1969-12-31 16:00:05 2000 24 +13 bar 80.0 2 1969-12-31 16:00:05 2001 24 +2 foo 0.8 1 1969-12-31 16:00:00 2000 24 +2 foo 0.8 1 1969-12-31 16:00:00 2001 24 +5 eat 0.8 6 1969-12-31 16:00:20 2000 24 +5 eat 0.8 6 1969-12-31 16:00:20 2001 24 +PREHOOK: query: insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: query: insert overwrite table orc_merge5a partition (year="2000",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2000,hour=24).userid SIMPLE 
[(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: insert overwrite table orc_merge5a partition (year="2001",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5a@year=2001/hour=24 +POSTHOOK: query: insert overwrite table orc_merge5a partition (year="2001",hour=24) select userid,string1,subtype,decimal1,ts from orc_merge5 where userid<=13 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5a@year=2001/hour=24 +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(year=2001,hour=24).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: query: analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@year=2000/hour=24 +PREHOOK: query: analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@year=2001/hour=24 +POSTHOOK: query: analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@year=2001/hour=24 +PREHOOK: query: desc formatted orc_merge5a partition(year="2000",hour=24) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: desc formatted orc_merge5a partition(year="2000",hour=24) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +year string +hour int + +# Detailed Partition Information +Partition Value: [2000, 24] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 3 + numRows 3 + rawDataSize 765 + totalSize 1088 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted orc_merge5a partition(year="2001",hour=24) +PREHOOK: type: DESCTABLE +PREHOOK: Input: 
default@orc_merge5a +POSTHOOK: query: desc formatted orc_merge5a partition(year="2001",hour=24) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +year string +hour int + +# Detailed Partition Information +Partition Value: [2001, 24] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 3 + numRows 3 + rawDataSize 765 + totalSize 1088 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions orc_merge5a +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: show partitions orc_merge5a +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Input: default@orc_merge5a +year=2000/hour=24 +year=2001/hour=24 +PREHOOK: query: select * from orc_merge5a +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5a +PREHOOK: Input: default@orc_merge5a@year=2000/hour=24 +PREHOOK: Input: default@orc_merge5a@year=2001/hour=24 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_merge5a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5a +POSTHOOK: Input: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: Input: default@orc_merge5a@year=2001/hour=24 +#### A masked pattern was here #### +13 bar 80.0 2 1969-12-31 16:00:05 2000 24 +13 bar 80.0 2 1969-12-31 16:00:05 2001 24 +2 foo 0.8 1 1969-12-31 16:00:00 2000 24 +2 foo 0.8 1 1969-12-31 16:00:00 2001 24 +5 eat 0.8 6 1969-12-31 16:00:20 2000 24 +5 eat 0.8 6 1969-12-31 16:00:20 2001 24 +PREHOOK: query: explain alter table orc_merge5a partition(year="2000",hour=24) concatenate +PREHOOK: type: ALTER_PARTITION_MERGE +POSTHOOK: query: explain alter table orc_merge5a partition(year="2000",hour=24) concatenate +POSTHOOK: type: ALTER_PARTITION_MERGE +STAGE DEPENDENCIES: + Stage-0 is a root stage + Stage-1 depends on stages: Stage-0 + Stage-2 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-0 + + Stage: Stage-1 + Move Operator + tables: + partition: + hour 24 + year 2000 + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5a + + Stage: Stage-2 + Stats-Aggr Operator + +PREHOOK: query: alter table orc_merge5a partition(year="2000",hour=24) concatenate +PREHOOK: type: ALTER_PARTITION_MERGE +PREHOOK: Input: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: query: alter table orc_merge5a partition(year="2000",hour=24) concatenate +POSTHOOK: type: ALTER_PARTITION_MERGE +POSTHOOK: Input: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@year=2000/hour=24 +PREHOOK: query: alter table orc_merge5a partition(year="2001",hour=24) concatenate +PREHOOK: type: ALTER_PARTITION_MERGE +PREHOOK: Input: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@year=2001/hour=24 +POSTHOOK: query: alter table orc_merge5a partition(year="2001",hour=24) 
concatenate +POSTHOOK: type: ALTER_PARTITION_MERGE +POSTHOOK: Input: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@year=2001/hour=24 +PREHOOK: query: -- 1 file after merging +analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: query: -- 1 file after merging +analyze table orc_merge5a partition(year="2000",hour=24) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@year=2000/hour=24 +PREHOOK: query: analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@year=2001/hour=24 +POSTHOOK: query: analyze table orc_merge5a partition(year="2001",hour=24) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@year=2001/hour=24 +PREHOOK: query: desc formatted orc_merge5a partition(year="2000",hour=24) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: desc formatted orc_merge5a partition(year="2000",hour=24) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +year string +hour int + +# Detailed Partition Information +Partition Value: [2000, 24] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 1 + numRows 3 + rawDataSize 765 + totalSize 863 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted orc_merge5a partition(year="2001",hour=24) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: desc formatted orc_merge5a partition(year="2001",hour=24) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +year string +hour int + +# Detailed Partition Information +Partition Value: [2001, 24] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 1 + numRows 3 + rawDataSize 765 + totalSize 863 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions orc_merge5a +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: show partitions orc_merge5a +POSTHOOK: type: 
SHOWPARTITIONS +POSTHOOK: Input: default@orc_merge5a +year=2000/hour=24 +year=2001/hour=24 +PREHOOK: query: select * from orc_merge5a +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5a +PREHOOK: Input: default@orc_merge5a@year=2000/hour=24 +PREHOOK: Input: default@orc_merge5a@year=2001/hour=24 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_merge5a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5a +POSTHOOK: Input: default@orc_merge5a@year=2000/hour=24 +POSTHOOK: Input: default@orc_merge5a@year=2001/hour=24 +#### A masked pattern was here #### +13 bar 80.0 2 1969-12-31 16:00:05 2000 24 +13 bar 80.0 2 1969-12-31 16:00:05 2001 24 +2 foo 0.8 1 1969-12-31 16:00:00 2000 24 +2 foo 0.8 1 1969-12-31 16:00:00 2001 24 +5 eat 0.8 6 1969-12-31 16:00:20 2000 24 +5 eat 0.8 6 1969-12-31 16:00:20 2001 24 diff --git ql/src/test/results/clientpositive/tez/orc_merge7.q.out ql/src/test/results/clientpositive/tez/orc_merge7.q.out new file mode 100644 index 0000000..aa57a83 --- /dev/null +++ ql/src/test/results/clientpositive/tez/orc_merge7.q.out @@ -0,0 +1,941 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS + +-- orc merge file tests for dynamic partition case + +create table orc_merge5 (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: -- SORT_QUERY_RESULTS + +-- orc merge file tests for dynamic partition case + +create table orc_merge5 (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_merge5 +PREHOOK: query: create table orc_merge5a (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) partitioned by (st double) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +POSTHOOK: query: create table orc_merge5a (userid bigint, string1 string, subtype double, decimal1 decimal, ts timestamp) partitioned by (st double) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_merge5a +PREHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_merge5 +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@orc_merge5 +POSTHOOK: query: load data local inpath '../../data/files/orc_split_elim.orc' into table orc_merge5 +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@orc_merge5 +PREHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +PREHOOK: type: QUERY +POSTHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: orc_merge5 + Statistics: Num rows: 919 Data size: 246402 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: userid (type: bigint), string1 (type: string), subtype (type: double), decimal1 (type: decimal(10,0)), ts (type: timestamp), subtype (type: double) + outputColumnNames: _col0, _col1, 
_col2, _col3, _col4, _col5 + Statistics: Num rows: 919 Data size: 246402 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 919 Data size: 246402 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5a + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + partition: + st + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5a + + Stage: Stage-3 + Stats-Aggr Operator + +PREHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5a +POSTHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5a@st=0.8 +POSTHOOK: Output: default@orc_merge5a@st=1.8 +POSTHOOK: Output: default@orc_merge5a@st=8.0 +POSTHOOK: Output: default@orc_merge5a@st=80.0 +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).userid SIMPLE 
[(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5a +POSTHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5a@st=0.8 +POSTHOOK: Output: default@orc_merge5a@st=1.8 +POSTHOOK: Output: default@orc_merge5a@st=8.0 +POSTHOOK: Output: default@orc_merge5a@st=80.0 +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).userid SIMPLE 
[(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: -- 3 files total +analyze table orc_merge5a partition(st=80.0) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@st=80.0 +POSTHOOK: query: -- 3 files total +analyze table orc_merge5a partition(st=80.0) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@st=80.0 +PREHOOK: query: analyze table orc_merge5a partition(st=0.8) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@st=0.8 +POSTHOOK: query: analyze table orc_merge5a partition(st=0.8) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@st=0.8 +PREHOOK: query: desc formatted orc_merge5a partition(st=80.0) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: desc formatted orc_merge5a partition(st=80.0) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +st double + +# Detailed Partition Information +Partition Value: [80.0] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 1 + numRows 1 + rawDataSize 255 + totalSize 498 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted orc_merge5a partition(st=0.8) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: desc formatted orc_merge5a partition(st=0.8) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +st double + +# Detailed Partition Information +Partition Value: [0.8] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 2 + numRows 2 + rawDataSize 510 + totalSize 1010 +#### A masked pattern was here #### + +# Storage Information +SerDe 
Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions orc_merge5a +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: show partitions orc_merge5a +POSTHOOK: type: SHOWPARTITIONS +POSTHOOK: Input: default@orc_merge5a +st=0.8 +st=1.8 +st=8.0 +st=80.0 +PREHOOK: query: select * from orc_merge5a where userid<=13 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5a +PREHOOK: Input: default@orc_merge5a@st=0.8 +PREHOOK: Input: default@orc_merge5a@st=1.8 +PREHOOK: Input: default@orc_merge5a@st=8.0 +PREHOOK: Input: default@orc_merge5a@st=80.0 +#### A masked pattern was here #### +POSTHOOK: query: select * from orc_merge5a where userid<=13 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5a +POSTHOOK: Input: default@orc_merge5a@st=0.8 +POSTHOOK: Input: default@orc_merge5a@st=1.8 +POSTHOOK: Input: default@orc_merge5a@st=8.0 +POSTHOOK: Input: default@orc_merge5a@st=80.0 +#### A masked pattern was here #### +13 bar 80.0 2 1969-12-31 16:00:05 80.0 +2 foo 0.8 1 1969-12-31 16:00:00 0.8 +5 eat 0.8 6 1969-12-31 16:00:20 0.8 +PREHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +PREHOOK: type: QUERY +POSTHOOK: query: -- 3 mappers +explain insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-8 depends on stages: Stage-1 , consists of Stage-5, Stage-4, Stage-6 + Stage-5 + Stage-2 depends on stages: Stage-5, Stage-4, Stage-7 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 + Stage-6 + Stage-7 depends on stages: Stage-6 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: orc_merge5 + Statistics: Num rows: 919 Data size: 246402 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: userid (type: bigint), string1 (type: string), subtype (type: double), decimal1 (type: decimal(10,0)), ts (type: timestamp), subtype (type: double) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 919 Data size: 246402 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 919 Data size: 246402 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5a + + Stage: Stage-8 + Conditional Operator + + Stage: Stage-5 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + partition: + st + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.orc_merge5a + + Stage: Stage-3 + Stats-Aggr Operator + + Stage: Stage-4 + Tez +#### A masked pattern was here #### + Vertices: + MergeFileWork + 
Merge File Work + merge level: stripe + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + + Stage: Stage-6 + Tez +#### A masked pattern was here #### + Vertices: + MergeFileWork + Merge File Work + merge level: stripe + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + + Stage: Stage-7 + Move Operator + files: + hdfs directory: true +#### A masked pattern was here #### + +PREHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5a +POSTHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5a@st=0.8 +POSTHOOK: Output: default@orc_merge5a@st=1.8 +POSTHOOK: Output: default@orc_merge5a@st=8.0 +POSTHOOK: Output: default@orc_merge5a@st=80.0 +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).subtype SIMPLE 
[(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_merge5 +PREHOOK: Output: default@orc_merge5a +POSTHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_merge5 +POSTHOOK: Output: default@orc_merge5a@st=0.8 +POSTHOOK: Output: default@orc_merge5a@st=1.8 +POSTHOOK: Output: default@orc_merge5a@st=8.0 +POSTHOOK: Output: default@orc_merge5a@st=80.0 +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).subtype SIMPLE 
[(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ] +POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ] +PREHOOK: query: -- 1 file after merging +analyze table orc_merge5a partition(st=80.0) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@st=80.0 +POSTHOOK: query: -- 1 file after merging +analyze table orc_merge5a partition(st=80.0) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@st=80.0 +PREHOOK: query: analyze table orc_merge5a partition(st=0.8) compute statistics noscan +PREHOOK: type: QUERY +PREHOOK: Output: default@orc_merge5a +PREHOOK: Output: default@orc_merge5a@st=0.8 +POSTHOOK: query: analyze table orc_merge5a partition(st=0.8) compute statistics noscan +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orc_merge5a +POSTHOOK: Output: default@orc_merge5a@st=0.8 +PREHOOK: query: desc formatted orc_merge5a partition(st=80.0) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: desc formatted orc_merge5a partition(st=80.0) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +st double + +# Detailed Partition Information +Partition Value: [80.0] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 1 + numRows 1 + rawDataSize 255 + totalSize 498 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: desc formatted orc_merge5a partition(st=0.8) +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: desc formatted orc_merge5a partition(st=0.8) +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@orc_merge5a +# col_name data_type comment + +userid bigint +string1 string +subtype double +decimal1 decimal(10,0) +ts timestamp + +# Partition Information +# col_name data_type comment + +st double + +# Detailed Partition Information +Partition Value: [0.8] +Database: default +Table: orc_merge5a +#### A masked pattern was here #### +Protect Mode: None +#### A masked pattern was here #### +Partition Parameters: + COLUMN_STATS_ACCURATE true + numFiles 1 + numRows 2 + rawDataSize 510 + totalSize 804 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde +InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: show partitions orc_merge5a +PREHOOK: type: SHOWPARTITIONS +PREHOOK: Input: default@orc_merge5a +POSTHOOK: query: show 
partitions orc_merge5a
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Input: default@orc_merge5a
+st=0.8
+st=1.8
+st=8.0
+st=80.0
+PREHOOK: query: select * from orc_merge5a where userid<=13
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_merge5a
+PREHOOK: Input: default@orc_merge5a@st=0.8
+PREHOOK: Input: default@orc_merge5a@st=1.8
+PREHOOK: Input: default@orc_merge5a@st=8.0
+PREHOOK: Input: default@orc_merge5a@st=80.0
+#### A masked pattern was here ####
+POSTHOOK: query: select * from orc_merge5a where userid<=13
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_merge5a
+POSTHOOK: Input: default@orc_merge5a@st=0.8
+POSTHOOK: Input: default@orc_merge5a@st=1.8
+POSTHOOK: Input: default@orc_merge5a@st=8.0
+POSTHOOK: Input: default@orc_merge5a@st=80.0
+#### A masked pattern was here ####
+13 bar 80.0 2 1969-12-31 16:00:05 80.0
+2 foo 0.8 1 1969-12-31 16:00:00 0.8
+5 eat 0.8 6 1969-12-31 16:00:20 0.8
+PREHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_merge5
+PREHOOK: Output: default@orc_merge5a
+POSTHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_merge5
+POSTHOOK: Output: default@orc_merge5a@st=0.8
+POSTHOOK: Output: default@orc_merge5a@st=1.8
+POSTHOOK: Output: default@orc_merge5a@st=8.0
+POSTHOOK: Output: default@orc_merge5a@st=80.0
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ]
+PREHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_merge5
+PREHOOK: Output: default@orc_merge5a
+POSTHOOK: query: insert overwrite table orc_merge5a partition (st) select userid,string1,subtype,decimal1,ts,subtype from orc_merge5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_merge5
+POSTHOOK: Output: default@orc_merge5a@st=0.8
+POSTHOOK: Output: default@orc_merge5a@st=1.8
+POSTHOOK: Output: default@orc_merge5a@st=8.0
+POSTHOOK: Output: default@orc_merge5a@st=80.0
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=0.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=1.8).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=8.0).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).decimal1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:decimal1, type:decimal(10,0), comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).string1 SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:string1, type:string, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).subtype SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:subtype, type:double, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).ts SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:ts, type:timestamp, comment:null), ]
+POSTHOOK: Lineage: orc_merge5a PARTITION(st=80.0).userid SIMPLE [(orc_merge5)orc_merge5.FieldSchema(name:userid, type:bigint, comment:null), ]
+PREHOOK: query: analyze table orc_merge5a partition(st=80.0) compute statistics noscan
+PREHOOK: type: QUERY
+PREHOOK: Output: default@orc_merge5a
+PREHOOK: Output: default@orc_merge5a@st=80.0
+POSTHOOK: query: analyze table orc_merge5a partition(st=80.0) compute statistics noscan
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@orc_merge5a
+POSTHOOK: Output: default@orc_merge5a@st=80.0
+PREHOOK: query: analyze table orc_merge5a partition(st=0.8) compute statistics noscan
+PREHOOK: type: QUERY
+PREHOOK: Output: default@orc_merge5a
+PREHOOK: Output: default@orc_merge5a@st=0.8
+POSTHOOK: query: analyze table orc_merge5a partition(st=0.8) compute statistics noscan
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@orc_merge5a
+POSTHOOK: Output: default@orc_merge5a@st=0.8
+PREHOOK: query: desc formatted orc_merge5a partition(st=80.0)
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@orc_merge5a
+POSTHOOK: query: desc formatted orc_merge5a partition(st=80.0)
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@orc_merge5a
+# col_name data_type comment
+
+userid bigint
+string1 string
+subtype double
+decimal1 decimal(10,0)
+ts timestamp
+
+# Partition Information
+# col_name data_type comment
+
+st double
+
+# Detailed Partition Information
+Partition Value: [80.0]
+Database: default
+Table: orc_merge5a
+#### A masked pattern was here ####
+Protect Mode: None
+#### A masked pattern was here ####
+Partition Parameters:
+ COLUMN_STATS_ACCURATE true
+ numFiles 1
+ numRows 1
+ rawDataSize 255
+ totalSize 498
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: desc formatted orc_merge5a partition(st=0.8)
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@orc_merge5a
+POSTHOOK: query: desc formatted orc_merge5a partition(st=0.8)
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@orc_merge5a
+# col_name data_type comment
+
+userid bigint
+string1 string
+subtype double
+decimal1 decimal(10,0)
+ts timestamp
+
+# Partition Information
+# col_name data_type comment
+
+st double
+
+# Detailed Partition Information
+Partition Value: [0.8]
+Database: default
+Table: orc_merge5a
+#### A masked pattern was here ####
+Protect Mode: None
+#### A masked pattern was here ####
+Partition Parameters:
+ COLUMN_STATS_ACCURATE true
+ numFiles 2
+ numRows 2
+ rawDataSize 510
+ totalSize 1010
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: show partitions orc_merge5a
+PREHOOK: type: SHOWPARTITIONS
+PREHOOK: Input: default@orc_merge5a
+POSTHOOK: query: show partitions orc_merge5a
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Input: default@orc_merge5a
+st=0.8
+st=1.8
+st=8.0
+st=80.0
+PREHOOK: query: select * from orc_merge5a where userid<=13
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_merge5a
+PREHOOK: Input: default@orc_merge5a@st=0.8
+PREHOOK: Input: default@orc_merge5a@st=1.8
+PREHOOK: Input: default@orc_merge5a@st=8.0
+PREHOOK: Input: default@orc_merge5a@st=80.0
+#### A masked pattern was here ####
+POSTHOOK: query: select * from orc_merge5a where userid<=13
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_merge5a
+POSTHOOK: Input: default@orc_merge5a@st=0.8
+POSTHOOK: Input: default@orc_merge5a@st=1.8
+POSTHOOK: Input: default@orc_merge5a@st=8.0
+POSTHOOK: Input: default@orc_merge5a@st=80.0
+#### A masked pattern was here ####
+13 bar 80.0 2 1969-12-31 16:00:05 80.0
+2 foo 0.8 1 1969-12-31 16:00:00 0.8
+5 eat 0.8 6 1969-12-31 16:00:20 0.8
+PREHOOK: query: explain alter table orc_merge5a partition(st=80.0) concatenate
+PREHOOK: type: ALTER_PARTITION_MERGE
+POSTHOOK: query: explain alter table orc_merge5a partition(st=80.0) concatenate
+POSTHOOK: type: ALTER_PARTITION_MERGE
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+ Stage-1 depends on stages: Stage-0
+ Stage-2 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-0
+
+ Stage: Stage-1
+ Move Operator
+ tables:
+ partition:
+ st 80.0
+ replace: true
+ table:
+ input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+ output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+ serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+ name: default.orc_merge5a
+
+ Stage: Stage-2
+ Stats-Aggr Operator
+
+PREHOOK: query: alter table orc_merge5a partition(st=80.0) concatenate
+PREHOOK: type: ALTER_PARTITION_MERGE
+PREHOOK: Input: default@orc_merge5a
+PREHOOK: Output: default@orc_merge5a@st=80.0
+POSTHOOK: query: alter table orc_merge5a partition(st=80.0) concatenate
+POSTHOOK: type: ALTER_PARTITION_MERGE
+POSTHOOK: Input: default@orc_merge5a
+POSTHOOK: Output: default@orc_merge5a@st=80.0
+PREHOOK: query: alter table orc_merge5a partition(st=0.8) concatenate
+PREHOOK: type: ALTER_PARTITION_MERGE
+PREHOOK: Input: default@orc_merge5a
+PREHOOK: Output: default@orc_merge5a@st=0.8
+POSTHOOK: query: alter table orc_merge5a partition(st=0.8) concatenate
+POSTHOOK: type: ALTER_PARTITION_MERGE
+POSTHOOK: Input: default@orc_merge5a
+POSTHOOK: Output: default@orc_merge5a@st=0.8
+PREHOOK: query: -- 1 file after merging
+analyze table orc_merge5a partition(st=80.0) compute statistics noscan
+PREHOOK: type: QUERY
+PREHOOK: Output: default@orc_merge5a
+PREHOOK: Output: default@orc_merge5a@st=80.0
+POSTHOOK: query: -- 1 file after merging
+analyze table orc_merge5a partition(st=80.0) compute statistics noscan
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@orc_merge5a
+POSTHOOK: Output: default@orc_merge5a@st=80.0
+PREHOOK: query: analyze table orc_merge5a partition(st=0.8) compute statistics noscan
+PREHOOK: type: QUERY
+PREHOOK: Output: default@orc_merge5a
+PREHOOK: Output: default@orc_merge5a@st=0.8
+POSTHOOK: query: analyze table orc_merge5a partition(st=0.8) compute statistics noscan
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@orc_merge5a
+POSTHOOK: Output: default@orc_merge5a@st=0.8
+PREHOOK: query: desc formatted orc_merge5a partition(st=80.0)
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@orc_merge5a
+POSTHOOK: query: desc formatted orc_merge5a partition(st=80.0)
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@orc_merge5a
+# col_name data_type comment
+
+userid bigint
+string1 string
+subtype double
+decimal1 decimal(10,0)
+ts timestamp
+
+# Partition Information
+# col_name data_type comment
+
+st double
+
+# Detailed Partition Information
+Partition Value: [80.0]
+Database: default
+Table: orc_merge5a
+#### A masked pattern was here ####
+Protect Mode: None
+#### A masked pattern was here ####
+Partition Parameters:
+ COLUMN_STATS_ACCURATE true
+ numFiles 1
+ numRows 1
+ rawDataSize 255
+ totalSize 498
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: desc formatted orc_merge5a partition(st=0.8)
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@orc_merge5a
+POSTHOOK: query: desc formatted orc_merge5a partition(st=0.8)
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@orc_merge5a
+# col_name data_type comment
+
+userid bigint
+string1 string
+subtype double
+decimal1 decimal(10,0)
+ts timestamp
+
+# Partition Information
+# col_name data_type comment
+
+st double
+
+# Detailed Partition Information
+Partition Value: [0.8]
+Database: default
+Table: orc_merge5a
+#### A masked pattern was here ####
+Protect Mode: None
+#### A masked pattern was here ####
+Partition Parameters:
+ COLUMN_STATS_ACCURATE true
+ numFiles 1
+ numRows 2
+ rawDataSize 510
+ totalSize 804
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+PREHOOK: query: show partitions orc_merge5a
+PREHOOK: type: SHOWPARTITIONS
+PREHOOK: Input: default@orc_merge5a
+POSTHOOK: query: show partitions orc_merge5a
+POSTHOOK: type: SHOWPARTITIONS
+POSTHOOK: Input: default@orc_merge5a
+st=0.8
+st=1.8
+st=8.0
+st=80.0
+PREHOOK: query: select * from orc_merge5a where userid<=13
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_merge5a
+PREHOOK: Input: default@orc_merge5a@st=0.8
+PREHOOK: Input: default@orc_merge5a@st=1.8
+PREHOOK: Input: default@orc_merge5a@st=8.0
+PREHOOK: Input: default@orc_merge5a@st=80.0
+#### A masked pattern was here ####
+POSTHOOK: query: select * from orc_merge5a where userid<=13
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_merge5a
+POSTHOOK: Input: default@orc_merge5a@st=0.8
+POSTHOOK: Input: default@orc_merge5a@st=1.8
+POSTHOOK: Input: default@orc_merge5a@st=8.0
+POSTHOOK: Input: default@orc_merge5a@st=80.0
+#### A masked pattern was here ####
+13 bar 80.0 2 1969-12-31 16:00:05 80.0
+2 foo 0.8 1 1969-12-31 16:00:00 0.8
+5 eat 0.8 6 1969-12-31 16:00:20 0.8