diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 7932a3d..f3682e7 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -623,6 +623,7 @@ HIVEOPTPPD("hive.optimize.ppd", true), // predicate pushdown HIVEPPDRECOGNIZETRANSITIVITY("hive.ppd.recognizetransivity", true), // predicate pushdown HIVEPPDREMOVEDUPLICATEFILTERS("hive.ppd.remove.duplicatefilters", true), + HIVEPPDFILES("hive.optimize.ppd.vc.filename", false), HIVEMETADATAONLYQUERIES("hive.optimize.metadataonly", true), // push predicates down to storage handlers HIVEOPTPPD_STORAGE("hive.optimize.ppd.storage", true), diff --git itests/qtest/pom.xml itests/qtest/pom.xml index dc4519a..216435f 100644 --- itests/qtest/pom.xml +++ itests/qtest/pom.xml @@ -39,7 +39,7 @@ stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q,index_bitmap3.q,ql_rewrite_gbtoidx.q,index_bitmap_auto.q,udf_using.q cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q,file_with_header_footer_negative.q,udf_local_resource.q tez_fsstat.q,mapjoin_decimal.q,tez_join_tests.q,tez_joins_explain.q,mrr.q,tez_dml.q,tez_insert_overwrite_local_directory_1.q,tez_union.q,bucket_map_join_tez1.q,bucket_map_join_tez2.q,tez_schema_evolution.q - cross_product_check_1.q,cross_product_check_2.q,dynpart_sort_opt_vectorization.q,dynpart_sort_optimization.q,orc_analyze.q,join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q,stats_counter.q,stats_noscan_1.q,stats_counter_partitioned.q,union2.q,union3.q,union4.q,union5.q,union6.q,union7.q,union8.q,union9.q,transform1.q,transform2.q,transform_ppr1.q,transform_ppr2.q,script_env_var1.q,script_env_var2.q,script_pipe.q,scriptfile1.q + 
cross_product_check_1.q,cross_product_check_2.q,dynpart_sort_opt_vectorization.q,dynpart_sort_optimization.q,orc_analyze.q,join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q,stats_counter.q,stats_noscan_1.q,stats_counter_partitioned.q,union2.q,union3.q,union4.q,union5.q,union6.q,union7.q,union8.q,union9.q,transform1.q,transform2.q,transform_ppr1.q,transform_ppr2.q,script_env_var1.q,script_env_var2.q,script_pipe.q,scriptfile1.q,file_pruning.q add_part_exist.q,alter1.q,alter2.q,alter4.q,alter5.q,alter_rename_partition.q,alter_rename_partition_authorization.q,archive.q,archive_corrupt.q,archive_multi.q,archive_mr_1806.q,archive_multi_mr_1806.q,authorization_1.q,authorization_2.q,authorization_4.q,authorization_5.q,authorization_6.q,authorization_7.q,ba_table1.q,ba_table2.q,ba_table3.q,ba_table_udfs.q,binary_table_bincolserde.q,binary_table_colserde.q,cluster.q,columnarserde_create_shortcut.q,combine2.q,constant_prop.q,create_nested_type.q,create_or_replace_view.q,create_struct_table.q,create_union_table.q,database.q,database_location.q,database_properties.q,ddltime.q,describe_database_json.q,drop_database_removes_partition_dirs.q,escape1.q,escape2.q,exim_00_nonpart_empty.q,exim_01_nonpart.q,exim_02_00_part_empty.q,exim_02_part.q,exim_03_nonpart_over_compat.q,exim_04_all_part.q,exim_04_evolved_parts.q,exim_05_some_part.q,exim_06_one_part.q,exim_07_all_part_over_nonoverlap.q,exim_08_nonpart_rename.q,exim_09_part_spec_nonoverlap.q,exim_10_external_managed.q,exim_11_managed_external.q,exim_12_external_location.q,exim_13_managed_location.q,exim_14_managed_location_over_existing.q,exim_15_external_part.q,exim_16_part_external.q,exim_17_part_managed.q,exim_18_part_external.q,exim_19_00_part_external_location.q,exim_19_part_external_location.q,exim_20_part_managed_location.q,exim_21_export_authsuccess.q,exim_22_import_exist_authsuccess.q,exim_23_import_part_authsuccess.q,exim_24_import_nonexist_authsuccess.q,global_limit.q,groupby_complex_types.q,groupby_complex_types_multi_single_reducer.q,index_auth.q,index_auto.q,index_auto_empty.q,index_bitmap.q,index_bitmap1.q,index_bitmap2.q,index_bitmap3.q,index_bitmap_auto.q,index_bitmap_rc.q,index_compact.q,index_compact_1.q,index_compact_2.q,index_compact_3.q,index_stale_partitioned.q,init_file.q,input16.q,input16_cc.q,input46.q,input_columnarserde.q,input_dynamicserde.q,input_lazyserde.q,input_testxpath3.q,input_testxpath4.q,insert2_overwrite_partitions.q,insertexternal1.q,join_thrift.q,lateral_view.q,load_binary_data.q,load_exist_part_authsuccess.q,load_nonpart_authsuccess.q,load_part_authsuccess.q,loadpart_err.q,lock1.q,lock2.q,lock3.q,lock4.q,merge_dynamic_partition.q,multi_insert.q,multi_insert_move_tasks_share_dependencies.q,null_column.q,ppd_clusterby.q,query_with_semi.q,rename_column.q,sample6.q,sample_islocalmode_hook.q,set_processor_namespaces.q,show_tables.q,source.q,split_sample.q,str_to_map.q,transform1.q,udaf_collect_set.q,udaf_context_ngrams.q,udaf_histogram_numeric.q,udaf_ngrams.q,udaf_percentile_approx.q,udf_array.q,udf_bitmap_and.q,u
df_bitmap_or.q,udf_explode.q,udf_format_number.q,udf_map.q,udf_map_keys.q,udf_map_values.q,udf_max.q,udf_min.q,udf_named_struct.q,udf_percentile.q,udf_printf.q,udf_sentences.q,udf_sort_array.q,udf_split.q,udf_struct.q,udf_substr.q,udf_translate.q,udf_union.q,udf_xpath.q,udtf_stack.q,view.q,virtual_column.q diff --git itests/util/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFPathName.java itests/util/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFPathName.java new file mode 100644 index 0000000..bcf2fb7 --- /dev/null +++ itests/util/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFPathName.java @@ -0,0 +1,87 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.io.Text; + +@Description(name = "pathname", + value = "_FUNC_(n0) - Returns the final component of input path") +public class GenericUDFPathName extends GenericUDF { + + StringObjectInspector inputOI; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 1 || !(arguments[0] instanceof StringObjectInspector)) { + throw new UDFArgumentException("pathname accepts one string input"); + } + inputOI = (StringObjectInspector) arguments[0]; + return PrimitiveObjectInspectorFactory.javaStringObjectInspector; + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + + if (inputOI.preferWritable()) { + Text text = inputOI.getPrimitiveWritableObject(arguments[0].get()); + if (text == null) { + return null; + } + byte[] bytes = text.getBytes(); + int i = text.getLength() - 1; + for (; i >= 0; i--) { + if (bytes[i] == Path.SEPARATOR_CHAR) { + text.set(bytes, i + 1, text.getLength() - (i + 1)); + break; + } + } + return text; + } + String string = inputOI.getPrimitiveJavaObject(arguments[0].get()); + if (string == null) { + return null; + } + int index = string.lastIndexOf(Path.SEPARATOR_CHAR); + if (index >= 0) { + return string.substring(index + 1, string.length()); + } + return string; + } + + @Override + public String getDisplayString(String[] children) { + StringBuilder sb = new StringBuilder(); + sb.append("pathname("); + for (int i = 0; i < children.length; i++) { + sb.append(children[i]); + if (i + 1 
!= children.length) { + sb.append(","); + } + } + sb.append(")"); + return sb.toString(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index a80feb9..c3e5835 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -1509,14 +1509,25 @@ public static Method getMethodInternal(Class udfClass, List mlist, bo * out of array and getting values out of map. */ public static GenericUDF getGenericUDFForIndex() { - return FunctionRegistry.getFunctionInfo("index").getGenericUDF(); + return getGenericUDF("index"); } /** * A shortcut to get the "and" GenericUDF. */ public static GenericUDF getGenericUDFForAnd() { - return FunctionRegistry.getFunctionInfo("and").getGenericUDF(); + return getGenericUDF("and"); + } + + /** + * A shortcut to get the "or" GenericUDF. + */ + public static GenericUDF getGenericUDFForOr() { + return getGenericUDF("or"); + } + + public static GenericUDF getGenericUDF(String name) { + return FunctionRegistry.getFunctionInfo(name).getGenericUDF(); } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java index 622ee45..d39c980 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java @@ -515,11 +515,12 @@ public void shutdown() { public List getResultSchema() { return null; } - Throwable getException() { + + protected Throwable getException() { return exception; } - void setException(Throwable ex) { + protected void setException(Throwable ex) { exception = ex; } diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 5e5cf97..10899b7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -3003,8 +3003,7 @@ public static double getHighestSamplePercentage (MapWork work) { * so we don't want to depend on scratch dir and context. 
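For reference, the GenericUDFPathName UDF introduced above lives under itests/util, so it is only on the classpath of the test modules; it returns the final component of its string argument, i.e. everything after the last '/'. A minimal illustrative driver, not part of the patch, using the standard GenericUDF calling convention (DeferredJavaObject plus a Java string ObjectInspector):

    import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

    public class PathNameUdfSketch {
      public static void main(String[] args) throws Exception {
        GenericUDFPathName udf = new GenericUDFPathName();
        // initialize() accepts exactly one string argument
        udf.initialize(new ObjectInspector[] {
            PrimitiveObjectInspectorFactory.javaStringObjectInspector });
        Object result = udf.evaluate(new GenericUDF.DeferredObject[] {
            new DeferredJavaObject("hdfs://nn:8020/warehouse/srcbucket2/srcbucket20.txt") });
        System.out.println(result);  // prints: srcbucket20.txt
      }
    }

In the new file_pruning.q test the same class is registered as a temporary function named pathname, which keeps the asserted output down to bare file names.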
*/ public static List getInputPathsTez(JobConf job, MapWork work) throws Exception { - List paths = getInputPaths(job, work, null, null); - return paths; + return getInputPaths(job, work, null); } /** @@ -3021,39 +3020,42 @@ public static double getHighestSamplePercentage (MapWork work) { * @return List of paths to process for the given MapWork * @throws Exception */ - public static List getInputPaths(JobConf job, MapWork work, Path hiveScratchDir, Context ctx) + public static List getInputPaths(JobConf job, MapWork work, Path hiveScratchDir) throws Exception { int sequenceNumber = 0; Set pathsProcessed = new HashSet(); List pathsToAdd = new LinkedList(); // AliasToWork contains all the aliases - for (String alias : work.getAliasToWork().keySet()) { + Map> aliasToWork = work.getAliasToWork(); + Map> pathToAliases = + new LinkedHashMap>(work.getPathToAliases()); + + for (String alias : aliasToWork.keySet()) { LOG.info("Processing alias " + alias); // The alias may not have any path Path path = null; - for (String file : new LinkedList(work.getPathToAliases().keySet())) { - List aliases = work.getPathToAliases().get(file); + for (Map.Entry> entry : pathToAliases.entrySet()) { + String file = entry.getKey(); + List aliases = entry.getValue(); if (aliases.contains(alias)) { path = new Path(file); // Multiple aliases can point to the same path - it should be // processed only once - if (pathsProcessed.contains(path)) { + if (!pathsProcessed.add(path)) { continue; } - pathsProcessed.add(path); - LOG.info("Adding input file " + path); - if (!HiveConf.getVar(job, ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") - && isEmptyPath(job, path, ctx)) { - path = createDummyFileForEmptyPartition(path, job, work, - hiveScratchDir, alias, sequenceNumber++); + List adding = work.getPrunedPaths(path, job); + if (adding.isEmpty() && hiveScratchDir != null) { + adding = Arrays.asList(createDummyFileForEmptyPartition(path, job, work, + hiveScratchDir, alias, sequenceNumber++)); } - pathsToAdd.add(path); + pathsToAdd.addAll(adding); } } @@ -3065,8 +3067,7 @@ public static double getHighestSamplePercentage (MapWork work) { // T2) x; // If T is empty and T2 contains 100 rows, the user expects: 0, 100 (2 // rows) - if (path == null - && !HiveConf.getVar(job, ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) { + if (path == null && hiveScratchDir != null) { path = createDummyFileForEmptyTable(job, work, hiveScratchDir, alias, sequenceNumber++); pathsToAdd.add(path); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java index 179ad29..ba457ff 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java @@ -36,6 +36,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.ContentSummary; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -103,11 +104,13 @@ private static final long serialVersionUID = 1L; private static final String JOBCONF_FILENAME = "jobconf.xml"; - protected transient JobConf job; - public static MemoryMXBean memoryMXBean; + protected static transient final Log LOG = LogFactory.getLog(ExecDriver.class); + protected static MemoryMXBean memoryMXBean; + protected HadoopJobExecHelper jobExecHelper; - protected static transient final Log LOG = 
LogFactory.getLog(ExecDriver.class); + protected transient JobConf job; + protected transient ContentSummary inputSummary; private RunningJob rj; @@ -120,6 +123,13 @@ public ExecDriver() { this.jobExecHelper = new HadoopJobExecHelper(job, console, this, this); } + public ContentSummary getInputSummary(Context ctx) throws IOException, HiveException { + if (inputSummary == null) { + inputSummary = getWork().getTotalSummary(ctx, job); + } + return inputSummary; + } + @Override public boolean requireLock() { return true; @@ -219,7 +229,7 @@ public int execute(DriverContext driverContext) { FileSystem fs = emptyScratchDir.getFileSystem(job); fs.mkdirs(emptyScratchDir); } catch (IOException e) { - e.printStackTrace(); + setException(e); console.printError("Error launching map-reduce job", "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e)); return 5; @@ -366,7 +376,7 @@ public int execute(DriverContext driverContext) { } } work.configureJobConf(job); - List inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx); + List inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir); Utilities.setInputPaths(job, inputPaths); Utilities.setMapRedWork(job, work, ctx.getMRTmpPath()); @@ -426,7 +436,7 @@ public int execute(DriverContext driverContext) { returnVal = jobExecHelper.progress(rj, jc, ctx.getHiveTxnManager()); success = (returnVal == 0); } catch (Exception e) { - e.printStackTrace(); + setException(e); String mesg = " with exception '" + Utilities.getNameMessage(e) + "'"; if (rj != null) { mesg = "Ended Job = " + rj.getJobID() + mesg; @@ -471,6 +481,7 @@ public int execute(DriverContext driverContext) { } } } catch (Exception e) { + setException(e); // jobClose needs to execute successfully otherwise fail task if (success) { success = false; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java index 2ce4dbd..5c93854 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java @@ -39,6 +39,7 @@ import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.Utilities.StreamPrinter; +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.ReduceWork; @@ -63,7 +64,6 @@ static final String HIVE_CHILD_CLIENT_DEBUG_OPTS = "HIVE_CHILD_CLIENT_DEBUG_OPTS"; static final String[] HIVE_SYS_PROP = {"build.dir", "build.dir.hive", "hive.query.id"}; - private transient ContentSummary inputSummary = null; private transient boolean runningViaChild = false; private transient long totalInputFileSize; @@ -88,15 +88,13 @@ public int execute(DriverContext driverContext) { } // estimate number of reducers - setNumberOfReducers(); + setNumberOfReducers(ctx); // auto-determine local mode if allowed if (!ctx.isLocalOnlyExecutionMode() && conf.getBoolVar(HiveConf.ConfVars.LOCALMODEAUTO)) { - if (inputSummary == null) { - inputSummary = Utilities.getInputSummary(driverContext.getCtx(), work.getMapWork(), null); - } + ContentSummary inputSummary = getInputSummary(ctx); // set the values of totalInputFileSize and totalInputNumFiles, estimating them // if percentage block sampling is being used @@ -377,7 +375,7 @@ public boolean reduceDone() { /** * Set the number of reducers for the mapred work. 
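Taken together, the ExecDriver/MapRedTask changes mean the input summary is computed once per task through the new MapredWork.getTotalSummary() path, which honors the file-pruning filter, and is then reused for both local-mode auto-selection and reducer estimation. A condensed sketch of the estimation side; the method names come from the patch, but this helper itself is illustrative and skips the explicit-reducer-count and jobconf-default branches that the real setNumberOfReducers() checks first:

    import org.apache.hadoop.fs.ContentSummary;
    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.hive.ql.Context;
    import org.apache.hadoop.hive.ql.exec.Utilities;
    import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
    import org.apache.hadoop.hive.ql.plan.MapredWork;
    import org.apache.hadoop.hive.ql.plan.ReduceWork;

    public class ReducerEstimateSketch {
      static void estimateReducers(ExecDriver task, HiveConf conf, Context ctx) throws Exception {
        MapredWork work = task.getWork();
        ReduceWork rWork = work.getReduceWork();
        if (rWork == null) {
          return;  // map-only job, nothing to estimate
        }
        // cached in ExecDriver; backed by MapWork.getTotalSummary(), i.e. only pruned paths
        ContentSummary summary = task.getInputSummary(ctx);
        rWork.setNumReduceTasks(Utilities.estimateNumberOfReducers(
            conf, summary, work.getMapWork(), work.isFinalMapRed()));
      }
    }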
*/ - private void setNumberOfReducers() throws IOException { + private void setNumberOfReducers(Context ctx) throws IOException, HiveException { ReduceWork rWork = work.getReduceWork(); // this is a temporary hack to fix things that are not fixed in the compiler Integer numReducersFromWork = rWork == null ? 0 : rWork.getNumReduceTasks(); @@ -396,9 +394,7 @@ private void setNumberOfReducers() throws IOException { .printInfo("Number of reduce tasks not specified. Defaulting to jobconf value of: " + reducers); } else { - if (inputSummary == null) { - inputSummary = Utilities.getInputSummary(driverContext.getCtx(), work.getMapWork(), null); - } + ContentSummary inputSummary = getInputSummary(ctx); int reducers = Utilities.estimateNumberOfReducers(conf, inputSummary, work.getMapWork(), work.isFinalMapRed()); rWork.setNumReduceTasks(reducers); diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java index 949bcfb..d5f3454 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java @@ -28,7 +28,6 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.Context; import org.apache.hadoop.hive.ql.DriverContext; import org.apache.hadoop.hive.ql.exec.Operator; @@ -171,6 +170,7 @@ public int execute(DriverContext driverContext) { } } } catch (Exception e) { + setException(e); LOG.error("Failed to execute tez graph.", e); // rc will be 1 at this point indicating failure. } finally { diff --git ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java index 683618f..4674098 100644 --- ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java @@ -62,6 +62,9 @@ private boolean acceptsFields; + private boolean allowAllColumns; + private boolean allowAllFunctions; + public IndexPredicateAnalyzer() { udfNames = new HashSet(); allowedColumnNames = new HashSet(); @@ -71,6 +74,14 @@ public void setFieldValidator(FieldValidator fieldValidator) { this.fieldValidator = fieldValidator; } + public void setAllowAllColumns(boolean allowAllColumns) { + this.allowAllColumns = allowAllColumns; + } + + public void setAllowAllFunctions(boolean allowAllFunctions) { + this.allowAllFunctions = allowAllFunctions; + } + /** * Registers a comparison operator as one which can be satisfied * by an index search. 
Unless this is called, analyzePredicate @@ -199,11 +210,11 @@ private ExprNodeDesc analyzeExpr( } String udfName = genericUDF.getUdfName(); - if (!udfNames.contains(genericUDF.getUdfName())) { + if (!allowAllFunctions && !udfNames.contains(genericUDF.getUdfName())) { return expr; } - if (!allowedColumnNames.contains(columnDesc.getColumn())) { + if (!allowAllColumns && !allowedColumnNames.contains(columnDesc.getColumn())) { return expr; } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java index c3a83d4..3417e1d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.ql.io; import java.io.IOException; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -35,7 +34,6 @@ import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.io.HivePassThroughOutputFormat; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; @@ -190,7 +188,7 @@ public static synchronized void registerInputFormatChecker( * get an InputFormatChecker for a file format. */ public static synchronized Class getInputFormatChecker( - Class inputFormat) { + Class inputFormat) { Class result = inputFormatCheckerMap .get(inputFormat); return result; @@ -428,7 +426,6 @@ private static String getMatchingPath(Map> pathToAlias if (foundAlias(pathToAliases, dirPath)) { return dirPath; } - path = dirPath; String dirStr = dir.toString(); int dirPathIndex = dirPath.lastIndexOf(Path.SEPARATOR); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java index 61cc874..dee7ab5 100755 --- ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java @@ -330,7 +330,8 @@ private void addSplitsForGroup(List dirs, TableScanOperator tableScan, Job get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "")); // for each dir, get the InputFormat, and do getSplits. for (Path dir : dirs) { - PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir); + PartitionDesc part = HiveFileFormatUtils. + getPartitionDescFromPathRecursively(pathToPartitionInfo, dir, null); Class inputFormatClass = part.getInputFileFormatClass(); TableDesc table = part.getTableDesc(); TableScanOperator tableScan = null; diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePruningPredicateHandler.java ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePruningPredicateHandler.java new file mode 100644 index 0000000..ccb79f9 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePruningPredicateHandler.java @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.metadata; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer; +import org.apache.hadoop.hive.ql.index.IndexSearchCondition; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.mapred.JobConf; + +/** + * Extracts file pruning expression (used by MapredWork) + */ +public class FilePruningPredicateHandler implements HiveStoragePredicateHandler { + + @Override + public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer deserializer, + ExprNodeDesc predicate) { + IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer(); + analyzer.allowColumnName(VirtualColumn.FILENAME.getName()); + analyzer.setAllowAllFunctions(true); + + List searchConditions = new ArrayList(); + ExprNodeDesc residual = analyzer.analyzePredicate(predicate, searchConditions); + + // there is no assertion that pushedPredicate to be applied always, so residue all of them. + DecomposedPredicate decomposedPredicate = new DecomposedPredicate(); + decomposedPredicate.pushedPredicate = analyzer.translateSearchConditions(searchConditions); + + // keep all (todo: can be removed when it's referenced by one alias) + decomposedPredicate.residualPredicate = predicate; + return decomposedPredicate; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java index 7d7c764..67927b8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java @@ -18,10 +18,9 @@ package org.apache.hadoop.hive.ql.metadata; +import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.serde2.Deserializer; -import org.apache.hadoop.mapred.JobConf; import java.io.Serializable; @@ -68,7 +67,7 @@ public DecomposedPredicate decomposePredicate( * Portion of predicate to be evaluated by storage handler. Hive * will pass this into the storage handler's input format. */ - public ExprNodeGenericFuncDesc pushedPredicate; + public ExprNodeDesc pushedPredicate; /** * Serialized format for filter @@ -79,6 +78,6 @@ public DecomposedPredicate decomposePredicate( * Portion of predicate to be post-evaluated by Hive for any rows * which are returned by storage handler. 
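To make the decomposition concrete, here is an illustrative driver (not part of the patch) that hands FilePruningPredicateHandler an equality condition on the INPUT__FILE__NAME virtual column; column-versus-constant comparisons are what IndexPredicateAnalyzer extracts. The condition comes back as pushedPredicate, while the whole original predicate is kept as residual, exactly as the comment in the handler explains:

    import java.util.Arrays;

    import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
    import org.apache.hadoop.hive.ql.metadata.FilePruningPredicateHandler;
    import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler.DecomposedPredicate;
    import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
    import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
    import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
    import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
    import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
    import org.apache.hadoop.mapred.JobConf;

    public class FilePruningDecomposeSketch {
      public static void main(String[] args) throws Exception {
        ExprNodeDesc fileName = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo,
            VirtualColumn.FILENAME.getName(), null, true);
        ExprNodeDesc constant =
            new ExprNodeConstantDesc("hdfs://nn:8020/warehouse/srcbucket2/srcbucket20.txt");
        ExprNodeDesc predicate = ExprNodeGenericFuncDesc.newInstance(
            FunctionRegistry.getFunctionInfo("=").getGenericUDF(),
            Arrays.asList(fileName, constant));

        // the deserializer argument is not used by this handler
        DecomposedPredicate dp = new FilePruningPredicateHandler()
            .decomposePredicate(new JobConf(), null, predicate);
        // pushedPredicate would be null if nothing referenced the virtual column
        System.out.println("pushed:   " + dp.pushedPredicate.getExprString());
        System.out.println("residual: " + dp.residualPredicate.getExprString());
      }
    }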
*/ - public ExprNodeGenericFuncDesc residualPredicate; + public ExprNodeDesc residualPredicate; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java index 33ef581..177ff2d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java @@ -124,10 +124,6 @@ public long getTotalKnownInputSize(Context context, MapWork currWork, Map> pathToAliases, HashMap aliasToSize) throws SemanticException { try { - // go over all the input paths, and calculate a known total size, known - // size for each input alias. - Utilities.getInputSummary(context, currWork, null).getLength(); - // set alias to size mapping, this can be used to determine if one table // is chosen as big table, what's the total size of left tables, which // are going to be small tables. @@ -135,7 +131,7 @@ public long getTotalKnownInputSize(Context context, MapWork currWork, for (Map.Entry> entry : pathToAliases.entrySet()) { String path = entry.getKey(); List aliasList = entry.getValue(); - ContentSummary cs = context.getCS(path); + ContentSummary cs = currWork.getSummaryFor(path, context.getConf()); if (cs != null) { long size = cs.getLength(); for (String alias : aliasList) { diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java index 5c6751c..a665345 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer.physical.index; -import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashSet; @@ -33,7 +32,6 @@ import org.apache.hadoop.hive.metastore.api.Index; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; -import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.hooks.ReadEntity; import org.apache.hadoop.hive.ql.index.HiveIndexHandler; @@ -204,14 +202,15 @@ private void rewriteForIndexes(ExprNodeDesc predicate, List indexes, } // check the size + MapredWork work = task.getWork(); try { - ContentSummary inputSummary = Utilities.getInputSummary(pctx.getContext(), task.getWork().getMapWork(), null); + ContentSummary inputSummary = work.getTotalSummary(pctx.getContext(), pctx.getConf()); long inputSize = inputSummary.getLength(); if (!indexHandler.checkQuerySize(inputSize, pctx.getConf())) { queryContext.setQueryTasks(null); return; } - } catch (IOException e) { + } catch (Exception e) { throw new SemanticException("Failed to get task size", e); } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java index 703c9d1..d7212f6 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.parse; -import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; @@ -205,9 +204,9 @@ public boolean accept(Path 
file) { boolean hasNonLocalJob = false; for (ExecDriver mrtask : mrtasks) { try { - ContentSummary inputSummary = Utilities.getInputSummary - (ctx, mrtask.getWork().getMapWork(), p); - int numReducers = getNumberOfReducers(mrtask.getWork(), conf); + MapredWork work = mrtask.getWork(); + ContentSummary inputSummary = work.getTotalSummary(lCtx, ctx.getConf()); + int numReducers = getNumberOfReducers(work, conf); long estimatedInput; @@ -240,7 +239,7 @@ public boolean accept(Path file) { } else { mrtask.setLocalMode(true); } - } catch (IOException e) { + } catch (Exception e) { throw new SemanticException(e); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 83d09c0..a8a3103 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -2357,8 +2357,8 @@ private Operator genNotNullFilterForJoinSourcePlan(QB qb, Operator input, args.add(joinKeys[i]); ExprNodeDesc nextExpr = ExprNodeGenericFuncDesc.newInstance( FunctionRegistry.getFunctionInfo("isnotnull").getGenericUDF(), args); - filterPred = filterPred == null ? nextExpr : ExprNodeDescUtils - .mergePredicates(filterPred, nextExpr); + filterPred = filterPred == null ? nextExpr : + ExprNodeDescUtils.andPredicates(filterPred, nextExpr); } if (filterPred == null) { diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java index f293c43..8d6f0cc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.plan; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; @@ -32,6 +33,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.util.ReflectionUtils; @@ -99,10 +101,30 @@ public static boolean containsPredicate(ExprNodeDesc source, ExprNodeDesc predic return false; } + public static ExprNodeGenericFuncDesc toPredicate(ExprNodeDesc predicate) { + if (predicate instanceof ExprNodeGenericFuncDesc) { + return (ExprNodeGenericFuncDesc) predicate; + } + return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + FunctionRegistry.getGenericUDF(serdeConstants.BOOLEAN_TYPE_NAME), + new ArrayList(Arrays.asList(predicate))); + } + + /** + * bind two predicates by OR op + */ + public static ExprNodeDesc orPredicates(ExprNodeDesc prev, ExprNodeDesc next) { + List children = new ArrayList(2); + children.add(prev); + children.add(next); + return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + FunctionRegistry.getGenericUDFForOr(), children); + } + /** * bind two predicates by AND op */ - public static ExprNodeGenericFuncDesc mergePredicates(ExprNodeDesc prev, ExprNodeDesc next) { + public static ExprNodeDesc andPredicates(ExprNodeDesc prev, ExprNodeDesc next) { List children = new ArrayList(2); children.add(prev); children.add(next); @@ -120,7 +142,7 @@ public static ExprNodeDesc mergePredicates(List exprs) { prev = expr; continue; } - prev = mergePredicates(prev, expr); + prev = 
andPredicates(prev, expr); } return prev; } @@ -133,7 +155,7 @@ public static ExprNodeDesc mergePredicates(List exprs) { } /** - * split predicates by AND op + * split 'current' by AND op and append to 'splitted' */ public static List split(ExprNodeDesc current, List splitted) { if (FunctionRegistry.isOpAnd(current)) { diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java index 9945dea..c952fe8 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java @@ -18,9 +18,12 @@ package org.apache.hadoop.hive.ql.plan; +import java.io.FileNotFoundException; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; @@ -32,16 +35,33 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory; import org.apache.hadoop.hive.ql.exec.FileSinkOperator; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.OperatorUtils; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat; +import org.apache.hadoop.hive.ql.io.HiveInputFormat; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol; import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.SortCol; import org.apache.hadoop.hive.ql.parse.OpParseContext; import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.SplitSample; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.JobConf; /** @@ -308,7 +328,8 @@ public String getVectorModeOn() { @Override public void replaceRoots(Map, Operator> replacementMap) { - LinkedHashMap> newAliasToWork = new LinkedHashMap>(); + LinkedHashMap> newAliasToWork = + new LinkedHashMap>(); for (Map.Entry> entry: aliasToWork.entrySet()) { newAliasToWork.put(entry.getKey(), replacementMap.get(entry.getValue())); @@ -534,4 +555,215 @@ public void setVectorMode(boolean vectorMode) { this.vectorMode = vectorMode; } + private static final ObjectInspector VC_FILE_OI = + ObjectInspectorFactory.getStandardStructObjectInspector( + Arrays.asList(VirtualColumn.FILENAME.getName()), + Arrays.asList((ObjectInspector) PrimitiveObjectInspectorFactory.javaStringObjectInspector)); + + private static final BooleanObjectInspector EVAL_OI = + PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; + + private transient Map pathToEvals; + private transient Map summaries; 
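The heart of the pruning path MapWork gains here (getPrunedPaths delegating to summarize) is an ExprNodeEvaluator run over a one-field struct whose single column is the INPUT__FILE__NAME virtual column. A self-contained sketch of that per-file check; the helper class and method are illustrative, but the ObjectInspector wiring mirrors the VC_FILE_OI/EVAL_OI constants above:

    import java.util.Arrays;

    import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
    import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
    import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
    import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

    public class FileFilterSketch {
      // fileFilterExpr is the expression stored on TableScanDesc by the PPD change in this patch
      static boolean accepts(ExprNodeDesc fileFilterExpr, String inputFile) throws Exception {
        ObjectInspector rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
            Arrays.asList(VirtualColumn.FILENAME.getName()),
            Arrays.asList((ObjectInspector)
                PrimitiveObjectInspectorFactory.javaStringObjectInspector));
        ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(fileFilterExpr);
        evaluator.initialize(rowOI);
        // the row is just the file path; in this sketch a null result counts as filtered out
        Object result = evaluator.evaluate(new String[] { inputFile });
        return result != null
            && PrimitiveObjectInspectorFactory.writableBooleanObjectInspector.get(result);
      }
    }

Files that fail this check are left out of the per-path InputSummary, so both split generation (Utilities.getInputPaths) and size-based decisions such as local-mode auto and reducer estimation only see the surviving files.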
+ + public ContentSummary getSummaryFor(String inputPath, Configuration conf) + throws IOException, HiveException { + return getInputSummaryFor(inputPath, conf).toContentSummary(); + } + + // if file pruning is applied, return paths passed the filter. if not, return input path + public List getPrunedPaths(Path inputPath, Configuration conf) + throws IOException, HiveException { + List paths = new ArrayList(); + for (String path : getInputSummaryFor(inputPath.toString(), conf).paths) { + paths.add(new Path(path)); + } + return paths; + } + + private InputSummary getInputSummaryFor(String path, Configuration conf) + throws IOException, HiveException { + InputSummary summary = summaries == null ? null : summaries.get(path); + if (summary == null) { + ExprNodeEvaluator evaluator = getPathToEvals().get(path); + if (summaries == null) { + summaries = new HashMap(); + } + summaries.put(path, summary = summarize(path, conf, evaluator)); + } + return summary; + } + + // get summaries for all input paths and return total of them + public ContentSummary getTotalSummary(Context ctx, Configuration conf) + throws IOException, HiveException { + long length = 0; + long fileCount = 0; + long directoryCount = 0; + for (String path : pathToAliases.keySet()) { + ContentSummary pathSummary = getSummaryFor(path, conf); + if (pathSummary != null) { + length += pathSummary.getLength(); + fileCount += pathSummary.getFileCount(); + directoryCount += pathSummary.getDirectoryCount(); + ctx.addCS(path, pathSummary); + } + } + return new ContentSummary(length, fileCount, directoryCount); + } + + // return or-conjuncted file pruning filter + private ExprNodeEvaluator toFilter(List> operators) throws HiveException { + ExprNodeDesc prev = null; + for (Operator operator : operators) { + if (operator instanceof TableScanOperator && operator.getConf() != null) { + ExprNodeDesc filterExpr = ((TableScanOperator) operator).getConf().getFileFilterExpr(); + if (filterExpr == null) { + continue; + } + if (prev == null) { + prev = filterExpr; + } else { + prev = ExprNodeDescUtils.orPredicates(prev, filterExpr); + } + } + } + if (prev == null) { + return null; + } + ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(prev); + evaluator.initialize(VC_FILE_OI); + return evaluator; + } + + // evaluate input path with file pruning filter + private InputSummary summarize(String path, Configuration conf, ExprNodeEvaluator evaluator) + throws IOException, HiveException { + Path apath = new Path(path); + Class format = pathToPartitionInfo.get(path).getInputFileFormatClass(); + if (ContentSummaryInputFormat.class.isAssignableFrom(format)) { + JobConf jobConf = new JobConf(conf); + ContentSummaryInputFormat summaryInput = (ContentSummaryInputFormat) + HiveInputFormat.getInputFormatFromCache(format, jobConf); + ContentSummary content = summaryInput.getContentSummary(apath, jobConf); + return new InputSummary(path, content); + } + if (evaluator == null) { + FileSystem fs = apath.getFileSystem(conf); + if (fs.exists(apath)) { + return new InputSummary(path, fs.getContentSummary(apath)); + } + return new InputSummary(path); + } + InputSummary summary = new InputSummary(); + for (FileStatus inputStatus : getInputStatus(apath, conf)) { + String input = normalize(path, inputStatus.getPath().toString()); + Object evaluated = evaluator.evaluate(new String[]{input}); + if (EVAL_OI.get(evaluated)) { + summary.add(input, inputStatus); + } + } + if (summary.paths.isEmpty()) { + // not existing directory + summary.paths.add(path); + } + return 
summary; + } + + private String normalize(String path, String inputPath) { + int index = inputPath.lastIndexOf(path); + return path + inputPath.substring(index + path.length()); + } + + private List getInputStatus(Path path, Configuration conf) throws IOException { + FileSystem fs = path.getFileSystem(conf); + if (!fs.exists(path)) { + return Collections.emptyList(); + } + FileStatus status; + try { + status = fs.getFileStatus(path); + } catch (FileNotFoundException e) { + return Collections.emptyList(); + } + return iterate(fs, status, new ArrayList()); + } + + private List iterate(FileSystem fs, FileStatus status, List passed) + throws IOException { + if (status.isDir()) { + FileStatus[] children = fs.listStatus(status.getPath()); + if (children != null) { + for (FileStatus child : children) { + iterate(fs, child, passed); + } + } + } else { + passed.add(status); + } + return passed; + } + + private Map getPathToEvals() throws HiveException { + if (pathToEvals == null) { + pathToEvals = new HashMap(); + } + for (Map.Entry> entry : pathToAliases.entrySet()) { + pathToEvals.put(entry.getKey(), toFilter(getWorkForAliases(entry.getValue()))); + } + return pathToEvals; + } + + private List> getWorkForAliases(List aliases) { + List> operators = new ArrayList>(); + for (String alias : aliases) { + Operator work = aliasToWork.get(alias); + if (work == null) { + throw new IllegalStateException("Invalid alias " + alias); + } + operators.add(work); + } + return operators; + } + + private static class InputSummary { + + private boolean exists = false; + + private long length; + private long fileCount; + private long directoryCount; + private List paths; + + public InputSummary() { + paths = new ArrayList(); + } + + public InputSummary(String path) { + paths = Arrays.asList(path); + } + + public InputSummary(String path, ContentSummary content) { + this.paths = Arrays.asList(path); + this.length = content.getLength(); + this.fileCount = content.getFileCount(); + this.directoryCount = content.getDirectoryCount(); + this.exists = true; + } + + public void add(String path, FileStatus status) { + paths.add(path); + length += status.getLen(); + if (!status.isDir()) { + fileCount++; + } else { + directoryCount++; + } + exists = true; + } + + public ContentSummary toContentSummary() { + return !exists ? 
null : new ContentSummary(length, fileCount, directoryCount); + } + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java index f3203bf..800ddca 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java @@ -18,10 +18,15 @@ package org.apache.hadoop.hive.ql.plan; +import java.io.IOException; import java.util.ArrayList; import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.hive.ql.Context; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.mapred.JobConf; @@ -80,4 +85,9 @@ public void configureJobConf(JobConf job) { return ops; } + + public ContentSummary getTotalSummary(Context ctx, Configuration conf) + throws IOException, HiveException { + return mapWork.getTotalSummary(ctx, conf); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java index 699b476..e7e504d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java @@ -67,6 +67,7 @@ private int maxStatsKeyPrefixLength = -1; private ExprNodeGenericFuncDesc filterExpr; + private ExprNodeGenericFuncDesc fileFilterExpr; private transient Serializable filterObject; // Both neededColumnIDs and neededColumns should never be null. @@ -164,6 +165,15 @@ public void setReferencedColumns(List referencedColumns) { return referencedColumns; } + @Explain(displayName = "fileFilterExpr") + public ExprNodeGenericFuncDesc getFileFilterExpr() { + return fileFilterExpr; + } + + public void setFileFilterExpr(ExprNodeGenericFuncDesc fileFilterExpr) { + this.fileFilterExpr = fileFilterExpr; + } + public void setAlias(String alias) { this.alias = alias; } diff --git ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java index 7aaf455..968732b 100644 --- ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java @@ -44,6 +44,7 @@ import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.FilePruningPredicateHandler; import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler; import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler; import org.apache.hadoop.hive.ql.metadata.Table; @@ -780,7 +781,7 @@ protected static Object createFilter(Operator op, * by Hive as a post-filter, or null if it was possible * to push down the entire predicate */ - private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( + private static ExprNodeDesc pushFilterToStorageHandler( TableScanOperator tableScanOp, ExprNodeGenericFuncDesc originalPredicate, OpWalkerInfo owi, @@ -794,11 +795,19 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( // optimizations are applied tableScanDesc.setFilterExpr(originalPredicate); } - if (!tbl.isNonNative()) { + + HiveStoragePredicateHandler predicateHandler = null; + if (tbl.isNonNative()) { + HiveStorageHandler storageHandler = tbl.getStorageHandler(); + if (storageHandler instanceof HiveStoragePredicateHandler) { + predicateHandler = (HiveStoragePredicateHandler) 
storageHandler; + } + } else if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEPPDFILES)) { + predicateHandler = new FilePruningPredicateHandler(); + } else { return originalPredicate; } - HiveStorageHandler storageHandler = tbl.getStorageHandler(); - if (!(storageHandler instanceof HiveStoragePredicateHandler)) { + if (predicateHandler == null) { // The storage handler does not provide predicate decomposition // support, so we'll implement the entire filter in Hive. However, // we still provide the full predicate to the storage handler in @@ -806,8 +815,7 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( tableScanDesc.setFilterExpr(originalPredicate); return originalPredicate; } - HiveStoragePredicateHandler predicateHandler = - (HiveStoragePredicateHandler) storageHandler; + JobConf jobConf = new JobConf(owi.getParseContext().getConf()); Utilities.setColumnNameList(jobConf, tableScanOp); Utilities.setColumnTypeList(jobConf, tableScanOp); @@ -842,10 +850,17 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( + decomposed.residualPredicate.getExprString()); } } - tableScanDesc.setFilterExpr(decomposed.pushedPredicate); - tableScanDesc.setFilterObject(decomposed.pushedPredicateObject); - return (ExprNodeGenericFuncDesc)decomposed.residualPredicate; + if (decomposed.pushedPredicate != null) { + ExprNodeGenericFuncDesc predicate = + ExprNodeDescUtils.toPredicate(decomposed.pushedPredicate); + if (predicateHandler instanceof FilePruningPredicateHandler) { + tableScanDesc.setFileFilterExpr(predicate); + } else { + tableScanDesc.setFilterExpr(predicate); + } + } + return decomposed.residualPredicate; } public static NodeProcessor getFilterProc() { diff --git ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java index 41243fe..d23146f 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java @@ -37,7 +37,6 @@ import org.apache.hadoop.hive.ql.Driver; import org.apache.hadoop.hive.ql.QueryPlan; import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.exec.mr.ExecDriver; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.PartitionDesc; @@ -177,7 +176,7 @@ public void testCombine() throws Exception { QueryPlan plan = drv.getPlan(); MapRedTask selectTask = (MapRedTask)plan.getRootTasks().get(0); - List inputPaths = Utilities.getInputPaths(newJob, selectTask.getWork().getMapWork(), emptyScratchDir, ctx); + List inputPaths = Utilities.getInputPaths(newJob, selectTask.getWork().getMapWork(), emptyScratchDir); Utilities.setInputPaths(newJob, inputPaths); Utilities.setMapRedWork(newJob, selectTask.getWork(), ctx.getMRTmpPath()); diff --git ql/src/test/queries/clientpositive/file_pruning.q ql/src/test/queries/clientpositive/file_pruning.q new file mode 100644 index 0000000..8231c0d --- /dev/null +++ ql/src/test/queries/clientpositive/file_pruning.q @@ -0,0 +1,17 @@ +-- SORT_QUERY_RESULTS + +create temporary function pathname as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName'; + +set hive.optimize.ppd.vc.filename=true; +set hive.auto.convert.join=false; + +-- HIVE-1662 File Prunning by filter on INPUT__FILE__NAME(VC) +-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23) +-- and selects srcbucket20 and srcbucket23 as input + 
+explain extended +select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A + where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value; + +select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A + where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value; diff --git ql/src/test/results/clientpositive/file_pruning.q.out ql/src/test/results/clientpositive/file_pruning.q.out new file mode 100644 index 0000000..0663456 --- /dev/null +++ ql/src/test/results/clientpositive/file_pruning.q.out @@ -0,0 +1,257 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS + +create temporary function pathname as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName' +PREHOOK: type: CREATEFUNCTION +PREHOOK: Output: database:default +POSTHOOK: query: -- SORT_QUERY_RESULTS + +create temporary function pathname as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName' +POSTHOOK: type: CREATEFUNCTION +POSTHOOK: Output: database:default +PREHOOK: query: -- HIVE-1662 File Prunning by filter on INPUT__FILE__NAME(VC) +-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23) +-- and selects srcbucket20 and srcbucket23 as input + +explain extended +select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A + where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value +PREHOOK: type: QUERY +POSTHOOK: query: -- HIVE-1662 File Prunning by filter on INPUT__FILE__NAME(VC) +-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23) +-- and selects srcbucket20 and srcbucket23 as input + +explain extended +select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A + where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_SUBQUERY + TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + srcbucket2 + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_TABLE_OR_COL + key + TOK_SELEXPR + TOK_TABLE_OR_COL + value + TOK_SELEXPR + TOK_FUNCTION + pathname + TOK_TABLE_OR_COL + INPUT__FILE__NAME + filename + A + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + TOK_WHERE + AND + rlike + TOK_TABLE_OR_COL + filename + 'srcbucket2[03].txt' + < + TOK_TABLE_OR_COL + key + 100 + TOK_ORDERBY + TOK_TABSORTCOLNAMEASC + TOK_TABLE_OR_COL + filename + TOK_TABSORTCOLNAMEASC + TOK_TABLE_OR_COL + key + TOK_TABSORTCOLNAMEASC + TOK_TABLE_OR_COL + value + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: srcbucket2 + Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: (key < 100) (type: boolean) + Statistics: Num rows: 18 Data size: 1902 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string), pathname(INPUT__FILE__NAME) (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 18 Data size: 1902 Basic stats: COMPLETE Column stats: NONE + Filter Operator + isSamplingPred: false + predicate: (_col2 rlike 'srcbucket2[03].txt') (type: boolean) + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column 
stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col2 (type: string), _col0 (type: int), _col1 (type: string) + sort order: +++ + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE + tag: -1 + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: srcbucket2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.srcbucket2 + name: default.srcbucket2 + Truncated Path -> Alias: + /srcbucket2 [a:srcbucket2] + Needs Tagging: false + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey1 (type: int), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey0 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A + where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value +PREHOOK: type: QUERY +PREHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +POSTHOOK: query: select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from 
srcbucket2) A + where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +0 val_0 srcbucket20.txt +0 val_0 srcbucket20.txt +0 val_0 srcbucket20.txt +10 val_10 srcbucket23.txt +11 val_11 srcbucket20.txt +15 val_15 srcbucket20.txt +15 val_15 srcbucket20.txt +18 val_18 srcbucket23.txt +18 val_18 srcbucket23.txt +19 val_19 srcbucket20.txt +26 val_26 srcbucket20.txt +26 val_26 srcbucket20.txt +33 val_33 srcbucket20.txt +37 val_37 srcbucket20.txt +37 val_37 srcbucket20.txt +4 val_4 srcbucket20.txt +43 val_43 srcbucket23.txt +44 val_44 srcbucket20.txt +47 val_47 srcbucket23.txt +51 val_51 srcbucket20.txt +51 val_51 srcbucket20.txt +54 val_54 srcbucket23.txt +58 val_58 srcbucket23.txt +58 val_58 srcbucket23.txt +65 val_65 srcbucket23.txt +66 val_66 srcbucket20.txt +69 val_69 srcbucket23.txt +72 val_72 srcbucket23.txt +72 val_72 srcbucket23.txt +76 val_76 srcbucket23.txt +76 val_76 srcbucket23.txt +77 val_77 srcbucket20.txt +8 val_8 srcbucket20.txt +80 val_80 srcbucket20.txt +83 val_83 srcbucket23.txt +83 val_83 srcbucket23.txt +84 val_84 srcbucket20.txt +84 val_84 srcbucket20.txt +87 val_87 srcbucket23.txt +90 val_90 srcbucket23.txt +90 val_90 srcbucket23.txt +90 val_90 srcbucket23.txt +95 val_95 srcbucket20.txt +95 val_95 srcbucket20.txt +98 val_98 srcbucket23.txt +98 val_98 srcbucket23.txt diff --git ql/src/test/results/clientpositive/tez/file_pruning.q.out ql/src/test/results/clientpositive/tez/file_pruning.q.out new file mode 100644 index 0000000..b352c0a --- /dev/null +++ ql/src/test/results/clientpositive/tez/file_pruning.q.out @@ -0,0 +1,263 @@ +PREHOOK: query: -- SORT_QUERY_RESULTS + +create temporary function pathname as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName' +PREHOOK: type: CREATEFUNCTION +PREHOOK: Output: database:default +POSTHOOK: query: -- SORT_QUERY_RESULTS + +create temporary function pathname as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName' +POSTHOOK: type: CREATEFUNCTION +POSTHOOK: Output: database:default +PREHOOK: query: -- HIVE-1662 File Prunning by filter on INPUT__FILE__NAME(VC) +-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23) +-- and selects srcbucket20 and srcbucket23 as input + +explain extended +select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A + where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value +PREHOOK: type: QUERY +POSTHOOK: query: -- HIVE-1662 File Prunning by filter on INPUT__FILE__NAME(VC) +-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23) +-- and selects srcbucket20 and srcbucket23 as input + +explain extended +select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A + where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_SUBQUERY + TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + srcbucket2 + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_TABLE_OR_COL + key + TOK_SELEXPR + TOK_TABLE_OR_COL + value + TOK_SELEXPR + TOK_FUNCTION + pathname + TOK_TABLE_OR_COL + INPUT__FILE__NAME + filename + A + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_ALLCOLREF + TOK_WHERE + AND + rlike + TOK_TABLE_OR_COL + filename + 'srcbucket2[03].txt' + < + TOK_TABLE_OR_COL + 
key + 100 + TOK_ORDERBY + TOK_TABSORTCOLNAMEASC + TOK_TABLE_OR_COL + filename + TOK_TABSORTCOLNAMEASC + TOK_TABLE_OR_COL + key + TOK_TABSORTCOLNAMEASC + TOK_TABLE_OR_COL + value + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcbucket2 + Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: (key < 100) (type: boolean) + Statistics: Num rows: 18 Data size: 1902 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string), pathname(INPUT__FILE__NAME) (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 18 Data size: 1902 Basic stats: COMPLETE Column stats: NONE + Filter Operator + isSamplingPred: false + predicate: (_col2 rlike 'srcbucket2[03].txt') (type: boolean) + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col2 (type: string), _col0 (type: int), _col1 (type: string) + sort order: +++ + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE + tag: -1 + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: srcbucket2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.srcbucket2 + name: default.srcbucket2 + Truncated Path -> Alias: + /srcbucket2 [srcbucket2] + Reducer 2 + Needs Tagging: false + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey1 (type: int), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey0 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 
+#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A + where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value +PREHOOK: type: QUERY +PREHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +POSTHOOK: query: select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A + where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +0 val_0 srcbucket20.txt +0 val_0 srcbucket20.txt +0 val_0 srcbucket20.txt +10 val_10 srcbucket23.txt +11 val_11 srcbucket20.txt +15 val_15 srcbucket20.txt +15 val_15 srcbucket20.txt +18 val_18 srcbucket23.txt +18 val_18 srcbucket23.txt +19 val_19 srcbucket20.txt +26 val_26 srcbucket20.txt +26 val_26 srcbucket20.txt +33 val_33 srcbucket20.txt +37 val_37 srcbucket20.txt +37 val_37 srcbucket20.txt +4 val_4 srcbucket20.txt +43 val_43 srcbucket23.txt +44 val_44 srcbucket20.txt +47 val_47 srcbucket23.txt +51 val_51 srcbucket20.txt +51 val_51 srcbucket20.txt +54 val_54 srcbucket23.txt +58 val_58 srcbucket23.txt +58 val_58 srcbucket23.txt +65 val_65 srcbucket23.txt +66 val_66 srcbucket20.txt +69 val_69 srcbucket23.txt +72 val_72 srcbucket23.txt +72 val_72 srcbucket23.txt +76 val_76 srcbucket23.txt +76 val_76 srcbucket23.txt +77 val_77 srcbucket20.txt +8 val_8 srcbucket20.txt +80 val_80 srcbucket20.txt +83 val_83 srcbucket23.txt +83 val_83 srcbucket23.txt +84 val_84 srcbucket20.txt +84 val_84 srcbucket20.txt +87 val_87 srcbucket23.txt +90 val_90 srcbucket23.txt +90 val_90 srcbucket23.txt +90 val_90 srcbucket23.txt +95 val_95 srcbucket20.txt +95 val_95 srcbucket20.txt +98 val_98 srcbucket23.txt +98 val_98 srcbucket23.txt
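
The pathname function registered in the tests above ('org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName') is what lets the queries filter on bare file names instead of full URIs: the expected results show INPUT__FILE__NAME reduced to srcbucket20.txt / srcbucket23.txt, so the predicate filename rlike 'srcbucket2[03].txt' keeps exactly two of srcbucket2's four bucket files, which is the HIVE-1662 file-pruning behaviour these plans and results exercise. The UDF's source is not part of the hunks shown here, so the Java below is only a minimal illustrative sketch, under the assumption that the function simply returns the last path component of its string argument; the class name is taken from the CREATE FUNCTION statement above, everything else is an assumption.

// Sketch only -- not the patch's actual GenericUDFPathName implementation.
// Assumed behaviour: strip a full input path such as
// hdfs://.../srcbucket2/srcbucket20.txt down to "srcbucket20.txt".
package org.apache.hadoop.hive.ql.udf.generic;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.io.Text;

public class GenericUDFPathName extends GenericUDF {

  private transient PrimitiveObjectInspector inputOI;
  private final Text result = new Text();

  @Override
  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
    if (arguments.length != 1) {
      throw new UDFArgumentLengthException("pathname() takes exactly one argument");
    }
    // A production version would also verify the argument is a string primitive.
    inputOI = (PrimitiveObjectInspector) arguments[0];
    // The result is the last component of the path, as a string.
    return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
  }

  @Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    Object value = arguments[0].get();
    if (value == null) {
      return null;
    }
    String fullPath = PrimitiveObjectInspectorUtils.getString(value, inputOI);
    // e.g. ".../srcbucket2/srcbucket20.txt" -> "srcbucket20.txt"
    result.set(new Path(fullPath).getName());
    return result;
  }

  @Override
  public String getDisplayString(String[] children) {
    return "pathname(" + children[0] + ")";
  }
}

Reusing a single Text instance and returning a writable string object inspector follows the usual GenericUDF pattern of avoiding per-row allocations. Registered with create temporary function pathname as in the tests, such a function yields the filename column seen in the expected output, against which the rlike predicate selects srcbucket20.txt and srcbucket23.txt.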