diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 44d9a57..20161e3 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -631,6 +631,7 @@
HIVEOPTPPD("hive.optimize.ppd", true), // predicate pushdown
HIVEPPDRECOGNIZETRANSITIVITY("hive.ppd.recognizetransivity", true), // predicate pushdown
HIVEPPDREMOVEDUPLICATEFILTERS("hive.ppd.remove.duplicatefilters", true),
+ HIVEPPDFILES("hive.optimize.ppd.vc.filename", false),
HIVEMETADATAONLYQUERIES("hive.optimize.metadataonly", true),
// push predicates down to storage handlers
HIVEOPTPPD_STORAGE("hive.optimize.ppd.storage", true),
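Note: hive.optimize.ppd.vc.filename is off by default. The new file_pruning.q test at the end of this patch turns it on per session (and disables auto map-join conversion); a minimal sketch of the session settings, taken from that test:

    set hive.optimize.ppd.vc.filename=true;
    set hive.auto.convert.join=false;

With the flag enabled, predicate pushdown can also hand filters over the INPUT__FILE__NAME virtual column to the new FilePruningPredicateHandler introduced below, so that whole input files can be pruned.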
diff --git itests/qtest/pom.xml itests/qtest/pom.xml
index dc4519a..216435f 100644
--- itests/qtest/pom.xml
+++ itests/qtest/pom.xml
@@ -39,7 +39,7 @@
stats_counter_partitioned.q,list_bucket_dml_10.q,input16_cc.q,scriptfile1.q,scriptfile1_win.q,bucket4.q,bucketmapjoin6.q,disable_merge_for_bucketing.q,reduce_deduplicate.q,smb_mapjoin_8.q,join1.q,groupby2.q,bucketizedhiveinputformat.q,bucketmapjoin7.q,optrstat_groupby.q,bucket_num_reducers.q,bucket5.q,load_fs2.q,bucket_num_reducers2.q,infer_bucket_sort_merge.q,infer_bucket_sort_reducers_power_two.q,infer_bucket_sort_dyn_part.q,infer_bucket_sort_bucketed_table.q,infer_bucket_sort_map_operators.q,infer_bucket_sort_num_buckets.q,leftsemijoin_mr.q,schemeAuthority.q,schemeAuthority2.q,truncate_column_buckets.q,remote_script.q,,load_hdfs_file_with_space_in_the_name.q,parallel_orderby.q,import_exported_table.q,stats_counter.q,auto_sortmerge_join_16.q,quotedid_smb.q,file_with_header_footer.q,external_table_with_space_in_location_path.q,root_dir_external_table.q,index_bitmap3.q,ql_rewrite_gbtoidx.q,index_bitmap_auto.q,udf_using.q
cluster_tasklog_retrieval.q,minimr_broken_pipe.q,mapreduce_stack_trace.q,mapreduce_stack_trace_turnoff.q,mapreduce_stack_trace_hadoop20.q,mapreduce_stack_trace_turnoff_hadoop20.q,file_with_header_footer_negative.q,udf_local_resource.q
tez_fsstat.q,mapjoin_decimal.q,tez_join_tests.q,tez_joins_explain.q,mrr.q,tez_dml.q,tez_insert_overwrite_local_directory_1.q,tez_union.q,bucket_map_join_tez1.q,bucket_map_join_tez2.q,tez_schema_evolution.q
- cross_product_check_1.q,cross_product_check_2.q,dynpart_sort_opt_vectorization.q,dynpart_sort_optimization.q,orc_analyze.q,join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q,stats_counter.q,stats_noscan_1.q,stats_counter_partitioned.q,union2.q,union3.q,union4.q,union5.q,union6.q,union7.q,union8.q,union9.q,transform1.q,transform2.q,transform_ppr1.q,transform_ppr2.q,script_env_var1.q,script_env_var2.q,script_pipe.q,scriptfile1.q
+ cross_product_check_1.q,cross_product_check_2.q,dynpart_sort_opt_vectorization.q,dynpart_sort_optimization.q,orc_analyze.q,join0.q,join1.q,auto_join0.q,auto_join1.q,bucket2.q,bucket3.q,bucket4.q,count.q,create_merge_compressed.q,cross_join.q,ctas.q,custom_input_output_format.q,disable_merge_for_bucketing.q,enforce_order.q,filter_join_breaktask.q,filter_join_breaktask2.q,groupby1.q,groupby2.q,groupby3.q,having.q,insert1.q,insert_into1.q,insert_into2.q,leftsemijoin.q,limit_pushdown.q,load_dyn_part1.q,load_dyn_part2.q,load_dyn_part3.q,mapjoin_mapjoin.q,mapreduce1.q,mapreduce2.q,merge1.q,merge2.q,metadata_only_queries.q,sample1.q,subquery_in.q,subquery_exists.q,vectorization_15.q,ptf.q,stats_counter.q,stats_noscan_1.q,stats_counter_partitioned.q,union2.q,union3.q,union4.q,union5.q,union6.q,union7.q,union8.q,union9.q,transform1.q,transform2.q,transform_ppr1.q,transform_ppr2.q,script_env_var1.q,script_env_var2.q,script_pipe.q,scriptfile1.q,file_pruning.q
add_part_exist.q,alter1.q,alter2.q,alter4.q,alter5.q,alter_rename_partition.q,alter_rename_partition_authorization.q,archive.q,archive_corrupt.q,archive_multi.q,archive_mr_1806.q,archive_multi_mr_1806.q,authorization_1.q,authorization_2.q,authorization_4.q,authorization_5.q,authorization_6.q,authorization_7.q,ba_table1.q,ba_table2.q,ba_table3.q,ba_table_udfs.q,binary_table_bincolserde.q,binary_table_colserde.q,cluster.q,columnarserde_create_shortcut.q,combine2.q,constant_prop.q,create_nested_type.q,create_or_replace_view.q,create_struct_table.q,create_union_table.q,database.q,database_location.q,database_properties.q,ddltime.q,describe_database_json.q,drop_database_removes_partition_dirs.q,escape1.q,escape2.q,exim_00_nonpart_empty.q,exim_01_nonpart.q,exim_02_00_part_empty.q,exim_02_part.q,exim_03_nonpart_over_compat.q,exim_04_all_part.q,exim_04_evolved_parts.q,exim_05_some_part.q,exim_06_one_part.q,exim_07_all_part_over_nonoverlap.q,exim_08_nonpart_rename.q,exim_09_part_spec_nonoverlap.q,exim_10_external_managed.q,exim_11_managed_external.q,exim_12_external_location.q,exim_13_managed_location.q,exim_14_managed_location_over_existing.q,exim_15_external_part.q,exim_16_part_external.q,exim_17_part_managed.q,exim_18_part_external.q,exim_19_00_part_external_location.q,exim_19_part_external_location.q,exim_20_part_managed_location.q,exim_21_export_authsuccess.q,exim_22_import_exist_authsuccess.q,exim_23_import_part_authsuccess.q,exim_24_import_nonexist_authsuccess.q,global_limit.q,groupby_complex_types.q,groupby_complex_types_multi_single_reducer.q,index_auth.q,index_auto.q,index_auto_empty.q,index_bitmap.q,index_bitmap1.q,index_bitmap2.q,index_bitmap3.q,index_bitmap_auto.q,index_bitmap_rc.q,index_compact.q,index_compact_1.q,index_compact_2.q,index_compact_3.q,index_stale_partitioned.q,init_file.q,input16.q,input16_cc.q,input46.q,input_columnarserde.q,input_dynamicserde.q,input_lazyserde.q,input_testxpath3.q,input_testxpath4.q,insert2_overwrite_partitions.q,insertexternal1.q,join_thrift.q,lateral_view.q,load_binary_data.q,load_exist_part_authsuccess.q,load_nonpart_authsuccess.q,load_part_authsuccess.q,loadpart_err.q,lock1.q,lock2.q,lock3.q,lock4.q,merge_dynamic_partition.q,multi_insert.q,multi_insert_move_tasks_share_dependencies.q,null_column.q,ppd_clusterby.q,query_with_semi.q,rename_column.q,sample6.q,sample_islocalmode_hook.q,set_processor_namespaces.q,show_tables.q,source.q,split_sample.q,str_to_map.q,transform1.q,udaf_collect_set.q,udaf_context_ngrams.q,udaf_histogram_numeric.q,udaf_ngrams.q,udaf_percentile_approx.q,udf_array.q,udf_bitmap_and.q,udf_bitmap_or.q,udf_explode.q,udf_format_number.q,udf_map.q,udf_map_keys.q,udf_map_values.q,udf_max.q,udf_min.q,udf_named_struct.q,udf_percentile.q,udf_printf.q,udf_sentences.q,udf_sort_array.q,udf_split.q,udf_struct.q,udf_substr.q,udf_translate.q,udf_union.q,udf_xpath.q,udtf_stack.q,view.q,virtual_column.q
diff --git itests/util/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFPathName.java itests/util/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFPathName.java
new file mode 100644
index 0000000..bcf2fb7
--- /dev/null
+++ itests/util/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFPathName.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
+import org.apache.hadoop.io.Text;
+
+@Description(name = "pathname",
+ value = "_FUNC_(n0) - Returns the final component of input path")
+public class GenericUDFPathName extends GenericUDF {
+
+ StringObjectInspector inputOI;
+
+ @Override
+ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+ if (arguments.length != 1 || !(arguments[0] instanceof StringObjectInspector)) {
+ throw new UDFArgumentException("pathname accepts one string input");
+ }
+ inputOI = (StringObjectInspector) arguments[0];
+ return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ }
+
+ @Override
+ public Object evaluate(DeferredObject[] arguments) throws HiveException {
+
+ if (inputOI.preferWritable()) {
+ Text text = inputOI.getPrimitiveWritableObject(arguments[0].get());
+ if (text == null) {
+ return null;
+ }
+ byte[] bytes = text.getBytes();
+ int i = text.getLength() - 1;
+ for (; i >= 0; i--) {
+ if (bytes[i] == Path.SEPARATOR_CHAR) {
+ text.set(bytes, i + 1, text.getLength() - (i + 1));
+ break;
+ }
+ }
+ return text;
+ }
+ String string = inputOI.getPrimitiveJavaObject(arguments[0].get());
+ if (string == null) {
+ return null;
+ }
+ int index = string.lastIndexOf(Path.SEPARATOR_CHAR);
+ if (index >= 0) {
+ return string.substring(index + 1, string.length());
+ }
+ return string;
+ }
+
+ @Override
+ public String getDisplayString(String[] children) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("pathname(");
+ for (int i = 0; i < children.length; i++) {
+ sb.append(children[i]);
+ if (i + 1 != children.length) {
+ sb.append(",");
+ }
+ }
+ sb.append(")");
+ return sb.toString();
+ }
+}
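For reference, this UDF lives on the itests side only; file_pruning.q (added later in this patch) registers it as a temporary function and applies it to the INPUT__FILE__NAME virtual column, roughly:

    create temporary function pathname as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName';
    -- pathname keeps only the final path component, e.g. '.../srcbucket2/srcbucket20.txt' -> 'srcbucket20.txt'
    select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2;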
diff --git ql/src/java/org/apache/hadoop/hive/ql/Context.java ql/src/java/org/apache/hadoop/hive/ql/Context.java
index abc4290..5e07f26 100644
--- ql/src/java/org/apache/hadoop/hive/ql/Context.java
+++ ql/src/java/org/apache/hadoop/hive/ql/Context.java
@@ -37,9 +37,7 @@
import org.apache.hadoop.hive.ql.lockmgr.HiveLockObj;
import org.apache.hadoop.hive.ql.lockmgr.HiveTxnManager;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
-import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.shims.ShimLoader;
-import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.StringUtils;
import java.io.DataInput;
@@ -54,8 +52,6 @@
import java.util.Random;
import java.util.concurrent.ConcurrentHashMap;
-import javax.security.auth.login.LoginException;
-
/**
* Context for Semantic Analyzers. Usage: not reusable - construct a new one for
* each query should call clear() at end of use to remove temporary folders
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
index a80feb9..c3e5835 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
@@ -1509,14 +1509,25 @@ public static Method getMethodInternal(Class> udfClass, List mlist, bo
* out of array and getting values out of map.
*/
public static GenericUDF getGenericUDFForIndex() {
- return FunctionRegistry.getFunctionInfo("index").getGenericUDF();
+ return getGenericUDF("index");
}
/**
* A shortcut to get the "and" GenericUDF.
*/
public static GenericUDF getGenericUDFForAnd() {
- return FunctionRegistry.getFunctionInfo("and").getGenericUDF();
+ return getGenericUDF("and");
+ }
+
+ /**
+ * A shortcut to get the "or" GenericUDF.
+ */
+ public static GenericUDF getGenericUDFForOr() {
+ return getGenericUDF("or");
+ }
+
+ public static GenericUDF getGenericUDF(String name) {
+ return FunctionRegistry.getFunctionInfo(name).getGenericUDF();
}
/**
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java
index 622ee45..d39c980 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Task.java
@@ -515,11 +515,12 @@ public void shutdown() {
public List getResultSchema() {
return null;
}
- Throwable getException() {
+
+ protected Throwable getException() {
return exception;
}
- void setException(Throwable ex) {
+ protected void setException(Throwable ex) {
exception = ex;
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
index 29d59a4..c8a5f29 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
@@ -2170,8 +2170,9 @@ public static void copyTableJobPropertiesToConf(TableDesc tbl, JobConf job) {
* @return the summary of all the input paths.
* @throws IOException
*/
+ @Deprecated
public static ContentSummary getInputSummary(final Context ctx, MapWork work, PathFilter filter)
- throws IOException {
+ throws IOException, HiveException {
PerfLogger perfLogger = PerfLogger.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
@@ -2367,31 +2368,6 @@ public static long sumOfExcept(Map aliasToSize,
return total;
}
- public static boolean isEmptyPath(JobConf job, Path dirPath, Context ctx)
- throws Exception {
- ContentSummary cs = ctx.getCS(dirPath);
- if (cs != null) {
- LOG.info("Content Summary " + dirPath + "length: " + cs.getLength() + " num files: "
- + cs.getFileCount() + " num directories: " + cs.getDirectoryCount());
- return (cs.getLength() == 0 && cs.getFileCount() == 0 && cs.getDirectoryCount() <= 1);
- } else {
- LOG.info("Content Summary not cached for " + dirPath);
- }
- return isEmptyPath(job, dirPath);
- }
-
- public static boolean isEmptyPath(JobConf job, Path dirPath) throws Exception {
- FileSystem inpFs = dirPath.getFileSystem(job);
-
- if (inpFs.exists(dirPath)) {
- FileStatus[] fStats = inpFs.listStatus(dirPath);
- if (fStats.length > 0) {
- return false;
- }
- }
- return true;
- }
-
public static List getTezTasks(List> tasks) {
List tezTasks = new ArrayList();
if (tasks != null) {
@@ -3003,8 +2979,7 @@ public static double getHighestSamplePercentage (MapWork work) {
* so we don't want to depend on scratch dir and context.
*/
public static List<Path> getInputPathsTez(JobConf job, MapWork work) throws Exception {
-    List<Path> paths = getInputPaths(job, work, null, null);
- return paths;
+ return getInputPaths(job, work, null);
}
/**
@@ -3021,39 +2996,42 @@ public static double getHighestSamplePercentage (MapWork work) {
* @return List of paths to process for the given MapWork
* @throws Exception
*/
-  public static List<Path> getInputPaths(JobConf job, MapWork work, Path hiveScratchDir, Context ctx)
+  public static List<Path> getInputPaths(JobConf job, MapWork work, Path hiveScratchDir)
throws Exception {
int sequenceNumber = 0;
-    Set<Path> pathsProcessed = new HashSet<Path>();
+    Set<String> pathsProcessed = new HashSet<String>();
List<Path> pathsToAdd = new LinkedList<Path>();
// AliasToWork contains all the aliases
- for (String alias : work.getAliasToWork().keySet()) {
+    Map<String, Operator<? extends OperatorDesc>> aliasToWork = work.getAliasToWork();
+    Map<String, ArrayList<String>> pathToAliases =
+        new LinkedHashMap<String, ArrayList<String>>(work.getPathToAliases());
+
+ for (String alias : aliasToWork.keySet()) {
LOG.info("Processing alias " + alias);
// The alias may not have any path
- Path path = null;
-      for (String file : new LinkedList<String>(work.getPathToAliases().keySet())) {
-        List<String> aliases = work.getPathToAliases().get(file);
+      String path = null;
+      for (Map.Entry<String, ArrayList<String>> entry : pathToAliases.entrySet()) {
+        path = entry.getKey();
+        List<String> aliases = entry.getValue();
if (aliases.contains(alias)) {
- path = new Path(file);
-
// Multiple aliases can point to the same path - it should be
// processed only once
- if (pathsProcessed.contains(path)) {
+ if (!pathsProcessed.add(path)) {
continue;
}
- pathsProcessed.add(path);
-
- LOG.info("Adding input file " + path);
- if (!HiveConf.getVar(job, ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")
- && isEmptyPath(job, path, ctx)) {
- path = createDummyFileForEmptyPartition(path, job, work,
- hiveScratchDir, alias, sequenceNumber++);
+ LOG.info("Adding input path " + path);
+          List<Path> adding;
+ if (!work.pathExists(path, job) && hiveScratchDir != null) {
+ adding = Arrays.asList(createDummyFileForEmptyPartition(path, job, work,
+ hiveScratchDir, alias, sequenceNumber++));
+ } else {
+ adding = work.getPrunedPaths(path, job);
}
- pathsToAdd.add(path);
+ pathsToAdd.addAll(adding);
}
}
@@ -3065,11 +3043,9 @@ public static double getHighestSamplePercentage (MapWork work) {
// T2) x;
// If T is empty and T2 contains 100 rows, the user expects: 0, 100 (2
// rows)
- if (path == null
- && !HiveConf.getVar(job, ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
- path = createDummyFileForEmptyTable(job, work, hiveScratchDir,
- alias, sequenceNumber++);
- pathsToAdd.add(path);
+ if (path == null && hiveScratchDir != null) {
+ pathsToAdd.add(createDummyFileForEmptyTable(job, work, hiveScratchDir,
+ alias, sequenceNumber++));
}
}
return pathsToAdd;
@@ -3107,12 +3083,10 @@ private static Path createEmptyFile(Path hiveScratchDir,
}
@SuppressWarnings("rawtypes")
- private static Path createDummyFileForEmptyPartition(Path path, JobConf job, MapWork work,
+ private static Path createDummyFileForEmptyPartition(String strPath, JobConf job, MapWork work,
Path hiveScratchDir, String alias, int sequenceNumber)
throws IOException, InstantiationException, IllegalAccessException {
- String strPath = path.toString();
-
// The input file does not exist, replace it by a empty file
PartitionDesc partDesc = work.getPathToPartitionInfo().get(strPath);
boolean nonNative = partDesc.getTableDesc().isNonNative();
@@ -3122,7 +3096,7 @@ private static Path createDummyFileForEmptyPartition(Path path, JobConf job, Map
if (nonNative) {
// if this isn't a hive table we can't create an empty file for it.
- return path;
+ return new Path(strPath);
}
Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job,
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java
index 1095173..926d979 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java
@@ -36,6 +36,7 @@
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -103,11 +104,13 @@
private static final long serialVersionUID = 1L;
private static final String JOBCONF_FILENAME = "jobconf.xml";
- protected transient JobConf job;
- public static MemoryMXBean memoryMXBean;
+ protected static transient final Log LOG = LogFactory.getLog(ExecDriver.class);
+ protected static MemoryMXBean memoryMXBean;
+
protected HadoopJobExecHelper jobExecHelper;
- protected static transient final Log LOG = LogFactory.getLog(ExecDriver.class);
+ protected transient JobConf job;
+ protected transient ContentSummary inputSummary;
private RunningJob rj;
@@ -120,6 +123,13 @@ public ExecDriver() {
this.jobExecHelper = new HadoopJobExecHelper(job, console, this, this);
}
+ public ContentSummary getInputSummary(Context ctx) throws IOException, HiveException {
+ if (inputSummary == null) {
+ inputSummary = getWork().getTotalSummary(ctx, job);
+ }
+ return inputSummary;
+ }
+
@Override
public boolean requireLock() {
return true;
@@ -219,7 +229,7 @@ public int execute(DriverContext driverContext) {
FileSystem fs = emptyScratchDir.getFileSystem(job);
fs.mkdirs(emptyScratchDir);
} catch (IOException e) {
- e.printStackTrace();
+ setException(e);
console.printError("Error launching map-reduce job", "\n"
+ org.apache.hadoop.util.StringUtils.stringifyException(e));
return 5;
@@ -366,7 +376,7 @@ public int execute(DriverContext driverContext) {
}
}
work.configureJobConf(job);
-      List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx);
+      List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir);
Utilities.setInputPaths(job, inputPaths);
Utilities.setMapRedWork(job, work, ctx.getMRTmpPath());
@@ -426,7 +436,7 @@ public int execute(DriverContext driverContext) {
returnVal = jobExecHelper.progress(rj, jc, ctx.getHiveTxnManager());
success = (returnVal == 0);
} catch (Exception e) {
- e.printStackTrace();
+ setException(e);
String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
if (rj != null) {
mesg = "Ended Job = " + rj.getJobID() + mesg;
@@ -471,6 +481,7 @@ public int execute(DriverContext driverContext) {
}
}
} catch (Exception e) {
+ setException(e);
// jobClose needs to execute successfully otherwise fail task
if (success) {
success = false;
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java
index a9869f7..cb3f5dd 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java
@@ -39,6 +39,7 @@
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.Utilities.StreamPrinter;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
@@ -63,7 +64,6 @@
static final String HIVE_CHILD_CLIENT_DEBUG_OPTS = "HIVE_CHILD_CLIENT_DEBUG_OPTS";
static final String[] HIVE_SYS_PROP = {"build.dir", "build.dir.hive", "hive.query.id"};
- private transient ContentSummary inputSummary = null;
private transient boolean runningViaChild = false;
private transient long totalInputFileSize;
@@ -88,15 +88,13 @@ public int execute(DriverContext driverContext) {
}
// estimate number of reducers
- setNumberOfReducers();
+ setNumberOfReducers(ctx);
// auto-determine local mode if allowed
if (!ctx.isLocalOnlyExecutionMode() &&
conf.getBoolVar(HiveConf.ConfVars.LOCALMODEAUTO)) {
- if (inputSummary == null) {
- inputSummary = Utilities.getInputSummary(driverContext.getCtx(), work.getMapWork(), null);
- }
+ ContentSummary inputSummary = getInputSummary(ctx);
// set the values of totalInputFileSize and totalInputNumFiles, estimating them
// if percentage block sampling is being used
@@ -376,7 +374,7 @@ public boolean reduceDone() {
/**
* Set the number of reducers for the mapred work.
*/
- private void setNumberOfReducers() throws IOException {
+ private void setNumberOfReducers(Context ctx) throws IOException, HiveException {
ReduceWork rWork = work.getReduceWork();
// this is a temporary hack to fix things that are not fixed in the compiler
Integer numReducersFromWork = rWork == null ? 0 : rWork.getNumReduceTasks();
@@ -395,9 +393,7 @@ private void setNumberOfReducers() throws IOException {
.printInfo("Number of reduce tasks not specified. Defaulting to jobconf value of: "
+ reducers);
} else {
- if (inputSummary == null) {
- inputSummary = Utilities.getInputSummary(driverContext.getCtx(), work.getMapWork(), null);
- }
+ ContentSummary inputSummary = getInputSummary(ctx);
int reducers = Utilities.estimateNumberOfReducers(conf, inputSummary, work.getMapWork(),
work.isFinalMapRed());
rWork.setNumReduceTasks(reducers);
diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java
index 949bcfb..d5f3454 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezTask.java
@@ -28,7 +28,6 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.exec.Operator;
@@ -171,6 +170,7 @@ public int execute(DriverContext driverContext) {
}
}
} catch (Exception e) {
+ setException(e);
LOG.error("Failed to execute tez graph.", e);
// rc will be 1 at this point indicating failure.
} finally {
diff --git ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java
index 683618f..4674098 100644
--- ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java
@@ -62,6 +62,9 @@
private boolean acceptsFields;
+ private boolean allowAllColumns;
+ private boolean allowAllFunctions;
+
public IndexPredicateAnalyzer() {
udfNames = new HashSet();
allowedColumnNames = new HashSet();
@@ -71,6 +74,14 @@ public void setFieldValidator(FieldValidator fieldValidator) {
this.fieldValidator = fieldValidator;
}
+ public void setAllowAllColumns(boolean allowAllColumns) {
+ this.allowAllColumns = allowAllColumns;
+ }
+
+ public void setAllowAllFunctions(boolean allowAllFunctions) {
+ this.allowAllFunctions = allowAllFunctions;
+ }
+
/**
* Registers a comparison operator as one which can be satisfied
* by an index search. Unless this is called, analyzePredicate
@@ -199,11 +210,11 @@ private ExprNodeDesc analyzeExpr(
}
String udfName = genericUDF.getUdfName();
- if (!udfNames.contains(genericUDF.getUdfName())) {
+ if (!allowAllFunctions && !udfNames.contains(genericUDF.getUdfName())) {
return expr;
}
- if (!allowedColumnNames.contains(columnDesc.getColumn())) {
+ if (!allowAllColumns && !allowedColumnNames.contains(columnDesc.getColumn())) {
return expr;
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
index c3a83d4..3417e1d 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java
@@ -19,7 +19,6 @@
package org.apache.hadoop.hive.ql.io;
import java.io.IOException;
-import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -35,7 +34,6 @@
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
-import org.apache.hadoop.hive.ql.io.HivePassThroughOutputFormat;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
@@ -190,7 +188,7 @@ public static synchronized void registerInputFormatChecker(
* get an InputFormatChecker for a file format.
*/
public static synchronized Class<? extends InputFormatChecker> getInputFormatChecker(
-      Class<?> inputFormat) {
+      Class<? extends InputFormat> inputFormat) {
Class<? extends InputFormatChecker> result = inputFormatCheckerMap
.get(inputFormat);
return result;
@@ -428,7 +426,6 @@ private static String getMatchingPath(Map> pathToAlias
if (foundAlias(pathToAliases, dirPath)) {
return dirPath;
}
- path = dirPath;
String dirStr = dir.toString();
int dirPathIndex = dirPath.lastIndexOf(Path.SEPARATOR);
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
index 61cc874..dee7ab5 100755
--- ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
@@ -330,7 +330,8 @@ private void addSplitsForGroup(List dirs, TableScanOperator tableScan, Job
get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, ""));
// for each dir, get the InputFormat, and do getSplits.
for (Path dir : dirs) {
- PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
+ PartitionDesc part = HiveFileFormatUtils.
+ getPartitionDescFromPathRecursively(pathToPartitionInfo, dir, null);
Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
TableDesc table = part.getTableDesc();
TableScanOperator tableScan = null;
diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePruningPredicateHandler.java ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePruningPredicateHandler.java
new file mode 100644
index 0000000..ccb79f9
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePruningPredicateHandler.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.metadata;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
+import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.serde2.Deserializer;
+import org.apache.hadoop.mapred.JobConf;
+
+/**
+ * Extracts file pruning expression (used by MapredWork)
+ */
+public class FilePruningPredicateHandler implements HiveStoragePredicateHandler {
+
+ @Override
+ public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer deserializer,
+ ExprNodeDesc predicate) {
+ IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();
+ analyzer.allowColumnName(VirtualColumn.FILENAME.getName());
+ analyzer.setAllowAllFunctions(true);
+
+    List<IndexSearchCondition> searchConditions = new ArrayList<IndexSearchCondition>();
+ ExprNodeDesc residual = analyzer.analyzePredicate(predicate, searchConditions);
+
+    // there is no guarantee that the pushed predicate is always applied, so keep the full predicate as the residual.
+ DecomposedPredicate decomposedPredicate = new DecomposedPredicate();
+ decomposedPredicate.pushedPredicate = analyzer.translateSearchConditions(searchConditions);
+
+    // keep all of it (todo: the residual could be narrowed when the path is referenced by only one alias)
+ decomposedPredicate.residualPredicate = predicate;
+ return decomposedPredicate;
+ }
+}
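As a concrete example, this is the shape of predicate the handler sees in the new file_pruning.q test (trimmed from that test):

    select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A
     where filename rlike 'srcbucket2[03].txt' AND key < 100;

The condition over the INPUT__FILE__NAME virtual column can be extracted as the pushed predicate, which OpProcFactory (below) stores on the TableScan as fileFilterExpr for MapWork to evaluate against each input file, while the full predicate is kept as the residual so row-level filtering never depends on the pruning being applied.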
diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java
index 7d7c764..67927b8 100644
--- ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java
+++ ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java
@@ -18,10 +18,9 @@
package org.apache.hadoop.hive.ql.metadata;
+import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
-import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.serde2.Deserializer;
-import org.apache.hadoop.mapred.JobConf;
import java.io.Serializable;
@@ -68,7 +67,7 @@ public DecomposedPredicate decomposePredicate(
* Portion of predicate to be evaluated by storage handler. Hive
* will pass this into the storage handler's input format.
*/
- public ExprNodeGenericFuncDesc pushedPredicate;
+ public ExprNodeDesc pushedPredicate;
/**
* Serialized format for filter
@@ -79,6 +78,6 @@ public DecomposedPredicate decomposePredicate(
* Portion of predicate to be post-evaluated by Hive for any rows
* which are returned by storage handler.
*/
- public ExprNodeGenericFuncDesc residualPredicate;
+ public ExprNodeDesc residualPredicate;
}
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java
index 33ef581..177ff2d 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java
@@ -124,10 +124,6 @@ public long getTotalKnownInputSize(Context context, MapWork currWork,
Map> pathToAliases,
HashMap aliasToSize) throws SemanticException {
try {
- // go over all the input paths, and calculate a known total size, known
- // size for each input alias.
- Utilities.getInputSummary(context, currWork, null).getLength();
-
// set alias to size mapping, this can be used to determine if one table
// is chosen as big table, what's the total size of left tables, which
// are going to be small tables.
@@ -135,7 +131,7 @@ public long getTotalKnownInputSize(Context context, MapWork currWork,
for (Map.Entry> entry : pathToAliases.entrySet()) {
String path = entry.getKey();
List aliasList = entry.getValue();
- ContentSummary cs = context.getCS(path);
+ ContentSummary cs = currWork.getSummaryFor(path, context.getConf());
if (cs != null) {
long size = cs.getLength();
for (String alias : aliasList) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java
index 5c6751c..a665345 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java
@@ -18,7 +18,6 @@
package org.apache.hadoop.hive.ql.optimizer.physical.index;
-import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
@@ -33,7 +32,6 @@
import org.apache.hadoop.hive.metastore.api.Index;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
-import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.index.HiveIndexHandler;
@@ -204,14 +202,15 @@ private void rewriteForIndexes(ExprNodeDesc predicate, List indexes,
}
// check the size
+ MapredWork work = task.getWork();
try {
- ContentSummary inputSummary = Utilities.getInputSummary(pctx.getContext(), task.getWork().getMapWork(), null);
+ ContentSummary inputSummary = work.getTotalSummary(pctx.getContext(), pctx.getConf());
long inputSize = inputSummary.getLength();
if (!indexHandler.checkQuerySize(inputSize, pctx.getConf())) {
queryContext.setQueryTasks(null);
return;
}
- } catch (IOException e) {
+ } catch (Exception e) {
throw new SemanticException("Failed to get task size", e);
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java
index 703c9d1..d7212f6 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java
@@ -18,7 +18,6 @@
package org.apache.hadoop.hive.ql.parse;
-import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
@@ -205,9 +204,9 @@ public boolean accept(Path file) {
boolean hasNonLocalJob = false;
for (ExecDriver mrtask : mrtasks) {
try {
- ContentSummary inputSummary = Utilities.getInputSummary
- (ctx, mrtask.getWork().getMapWork(), p);
- int numReducers = getNumberOfReducers(mrtask.getWork(), conf);
+ MapredWork work = mrtask.getWork();
+ ContentSummary inputSummary = work.getTotalSummary(lCtx, ctx.getConf());
+ int numReducers = getNumberOfReducers(work, conf);
long estimatedInput;
@@ -240,7 +239,7 @@ public boolean accept(Path file) {
} else {
mrtask.setLocalMode(true);
}
- } catch (IOException e) {
+ } catch (Exception e) {
throw new SemanticException(e);
}
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index 83d09c0..a8a3103 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -2357,8 +2357,8 @@ private Operator genNotNullFilterForJoinSourcePlan(QB qb, Operator input,
args.add(joinKeys[i]);
ExprNodeDesc nextExpr = ExprNodeGenericFuncDesc.newInstance(
FunctionRegistry.getFunctionInfo("isnotnull").getGenericUDF(), args);
- filterPred = filterPred == null ? nextExpr : ExprNodeDescUtils
- .mergePredicates(filterPred, nextExpr);
+ filterPred = filterPred == null ? nextExpr :
+ ExprNodeDescUtils.andPredicates(filterPred, nextExpr);
}
if (filterPred == null) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
index f293c43..8d6f0cc 100644
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.hive.ql.plan;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
import java.util.Map;
@@ -32,6 +33,7 @@
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.util.ReflectionUtils;
@@ -99,10 +101,30 @@ public static boolean containsPredicate(ExprNodeDesc source, ExprNodeDesc predic
return false;
}
+ public static ExprNodeGenericFuncDesc toPredicate(ExprNodeDesc predicate) {
+ if (predicate instanceof ExprNodeGenericFuncDesc) {
+ return (ExprNodeGenericFuncDesc) predicate;
+ }
+ return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
+ FunctionRegistry.getGenericUDF(serdeConstants.BOOLEAN_TYPE_NAME),
+        new ArrayList<ExprNodeDesc>(Arrays.asList(predicate)));
+ }
+
+ /**
+ * bind two predicates by OR op
+ */
+ public static ExprNodeDesc orPredicates(ExprNodeDesc prev, ExprNodeDesc next) {
+    List<ExprNodeDesc> children = new ArrayList<ExprNodeDesc>(2);
+ children.add(prev);
+ children.add(next);
+ return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
+ FunctionRegistry.getGenericUDFForOr(), children);
+ }
+
/**
* bind two predicates by AND op
*/
- public static ExprNodeGenericFuncDesc mergePredicates(ExprNodeDesc prev, ExprNodeDesc next) {
+ public static ExprNodeDesc andPredicates(ExprNodeDesc prev, ExprNodeDesc next) {
List children = new ArrayList(2);
children.add(prev);
children.add(next);
@@ -120,7 +142,7 @@ public static ExprNodeDesc mergePredicates(List exprs) {
prev = expr;
continue;
}
- prev = mergePredicates(prev, expr);
+ prev = andPredicates(prev, expr);
}
return prev;
}
@@ -133,7 +155,7 @@ public static ExprNodeDesc mergePredicates(List exprs) {
}
/**
- * split predicates by AND op
+ * split 'current' by AND op and append to 'splitted'
*/
public static List split(ExprNodeDesc current, List splitted) {
if (FunctionRegistry.isOpAnd(current)) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java
index 9945dea..70f8cb8 100644
--- ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java
@@ -18,6 +18,9 @@
package org.apache.hadoop.hive.ql.plan;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -32,16 +35,40 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.hadoop.hive.ql.Context;
+import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
+import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
+import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
+import org.apache.hadoop.hive.ql.io.HiveInputFormat;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
+import org.apache.hadoop.hive.ql.metadata.HiveUtils;
+import org.apache.hadoop.hive.ql.metadata.InputEstimator;
+import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol;
import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.SortCol;
import org.apache.hadoop.hive.ql.parse.OpParseContext;
import org.apache.hadoop.hive.ql.parse.QBJoinTree;
import org.apache.hadoop.hive.ql.parse.SplitSample;
+import org.apache.hadoop.hive.serde2.SerDeUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
/**
@@ -308,7 +335,8 @@ public String getVectorModeOn() {
@Override
public void replaceRoots(Map, Operator>> replacementMap) {
-    LinkedHashMap<String, Operator<? extends OperatorDesc>> newAliasToWork = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
+    LinkedHashMap<String, Operator<? extends OperatorDesc>> newAliasToWork =
+        new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
for (Map.Entry> entry: aliasToWork.entrySet()) {
newAliasToWork.put(entry.getKey(), replacementMap.get(entry.getValue()));
@@ -534,4 +562,235 @@ public void setVectorMode(boolean vectorMode) {
this.vectorMode = vectorMode;
}
+ private static final ObjectInspector VC_FILE_OI =
+ ObjectInspectorFactory.getStandardStructObjectInspector(
+ Arrays.asList(VirtualColumn.FILENAME.getName()),
+ Arrays.asList((ObjectInspector) PrimitiveObjectInspectorFactory.javaStringObjectInspector));
+
+ private static final BooleanObjectInspector EVAL_OI =
+ PrimitiveObjectInspectorFactory.writableBooleanObjectInspector;
+
+  private transient Map<String, ExprNodeEvaluator> pathToEvals;
+  private transient Map<String, InputSummary> summaries;
+
+ public ContentSummary getSummaryFor(String inputPath, Configuration conf)
+ throws IOException, HiveException {
+ return getInputSummaryFor(inputPath, conf).toContentSummary();
+ }
+
+ public boolean pathExists(String inputPath, Configuration conf) throws IOException, HiveException {
+ return getInputSummaryFor(inputPath, conf).exists;
+ }
+
+  // if file pruning is applied, return the paths that passed the filter; otherwise, return the input path
+  public List<Path> getPrunedPaths(String inputPath, Configuration conf)
+ throws IOException, HiveException {
+ return getInputSummaryFor(inputPath, conf).paths;
+ }
+
+ private InputSummary getInputSummaryFor(String inputPath, Configuration conf)
+ throws IOException, HiveException {
+ InputSummary summary = summaries == null ? null : summaries.get(inputPath);
+ if (summary == null) {
+ ExprNodeEvaluator evaluator = getPathToEvals().get(inputPath);
+ if (summaries == null) {
+        summaries = new HashMap<String, InputSummary>();
+ }
+ summaries.put(inputPath, summary = summarize(inputPath, conf, evaluator));
+ }
+ return summary;
+ }
+
+  // get summaries for all input paths and return their total
+ public ContentSummary getTotalSummary(Context ctx, Configuration conf)
+ throws IOException, HiveException {
+ long length = 0;
+ long fileCount = 0;
+ long directoryCount = 0;
+ for (String path : pathToAliases.keySet()) {
+ ContentSummary pathSummary = getSummaryFor(path, conf);
+ if (pathSummary != null) {
+ if (pathSummary.getLength() > 0) {
+ length += pathSummary.getLength();
+ }
+ if (pathSummary.getFileCount() > 0) {
+ fileCount += pathSummary.getFileCount();
+ }
+ if (pathSummary.getDirectoryCount() > 0) {
+ directoryCount += pathSummary.getDirectoryCount();
+ }
+ ctx.addCS(path, pathSummary);
+ }
+ }
+ return new ContentSummary(length, fileCount, directoryCount);
+ }
+
+  // build a single file pruning filter by OR-ing the filter expressions of the given table scans
+  private ExprNodeEvaluator toFilter(List<Operator<? extends OperatorDesc>> operators) throws HiveException {
+    ExprNodeDesc prev = null;
+    for (Operator<? extends OperatorDesc> operator : operators) {
+ if (operator instanceof TableScanOperator && operator.getConf() != null) {
+ ExprNodeDesc filterExpr = ((TableScanOperator) operator).getConf().getFileFilterExpr();
+ if (filterExpr == null) {
+ continue;
+ }
+ if (prev == null) {
+ prev = filterExpr;
+ } else {
+ prev = ExprNodeDescUtils.orPredicates(prev, filterExpr);
+ }
+ }
+ }
+ if (prev == null) {
+ return null;
+ }
+ ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(prev);
+ evaluator.initialize(VC_FILE_OI);
+ return evaluator;
+ }
+
+ // evaluate input path with file pruning filter
+ private InputSummary summarize(String pathStr, Configuration conf, ExprNodeEvaluator evaluator)
+ throws IOException, HiveException {
+ Path path = new Path(pathStr);
+ PartitionDesc partDesc = pathToPartitionInfo.get(pathStr);
+    Class<? extends InputFormat> format = partDesc.getInputFileFormatClass();
+ if (ContentSummaryInputFormat.class.isAssignableFrom(format)) {
+ JobConf jobConf = new JobConf(conf);
+ ContentSummaryInputFormat summaryInput = (ContentSummaryInputFormat)
+ HiveInputFormat.getInputFormatFromCache(format, jobConf);
+ ContentSummary content = summaryInput.getContentSummary(path, jobConf);
+ return new InputSummary(path, content);
+ }
+ if (evaluator == null) {
+ HiveStorageHandler handler = HiveUtils.getStorageHandler(conf,
+ SerDeUtils.createOverlayedProperties(
+ partDesc.getTableDesc().getProperties(),
+ partDesc.getProperties()).getProperty(hive_metastoreConstants.META_TABLE_STORAGE));
+ if (handler instanceof InputEstimator) {
+ long total = 0;
+ TableDesc tableDesc = partDesc.getTableDesc();
+ InputEstimator estimator = (InputEstimator) handler;
+ for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAliases, path)) {
+ JobConf jobConf = new JobConf(conf);
+ TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
+ Utilities.setColumnNameList(jobConf, scanOp, true);
+ Utilities.setColumnTypeList(jobConf, scanOp, true);
+ PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
+ Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
+ total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
+ }
+ return new InputSummary(path, new ContentSummary(total, -1, -1));
+ }
+ FileSystem fs = path.getFileSystem(conf);
+ if (fs.exists(path)) {
+ ContentSummary summary = fs.getContentSummary(path);
+ if (summary.getLength() != 0 || summary.getFileCount() != 0 || summary.getDirectoryCount() > 1) {
+ return new InputSummary(path, summary);
+ }
+ }
+ return new InputSummary(path);
+ }
+ // native tables
+ InputSummary summary = new InputSummary();
+ for (FileStatus inputStatus : getInputStatus(path, conf)) {
+ String relative = relativize(path, inputStatus.getPath());
+ Object evaluated = evaluator.evaluate(new String[]{relative});
+ if (EVAL_OI.get(evaluated)) {
+ summary.add(new Path(path, relative), inputStatus);
+ }
+ }
+ if (summary.paths.isEmpty()) {
+      // directory does not exist or no file passed the filter; keep the original input path
+ summary.paths.add(path);
+ }
+ return summary;
+ }
+
+ private String relativize(Path inputPath, Path childPath) {
+ URI relativized = inputPath.toUri().relativize(childPath.toUri());
+ if (relativized == childPath.toUri()) {
+ throw new IllegalStateException("Invalid child path " + childPath + " for input " + inputPath);
+ }
+ return relativized.getPath();
+ }
+
+ // todo exploit path filter
+ private FileStatus[] getInputStatus(Path path, Configuration conf) throws IOException {
+ FileSystem fs = path.getFileSystem(conf);
+ FileStatus status;
+ try {
+ status = fs.getFileStatus(path);
+ } catch (FileNotFoundException e) {
+ return new FileStatus[0];
+ }
+ if (status.isDir()) {
+ return fs.globStatus(new Path(path, "/*"));
+ }
+ return new FileStatus[]{status};
+ }
+
+  private Map<String, ExprNodeEvaluator> getPathToEvals() throws HiveException {
+    if (pathToEvals == null) {
+      pathToEvals = new HashMap<String, ExprNodeEvaluator>();
+    }
+    for (Map.Entry<String, ArrayList<String>> entry : pathToAliases.entrySet()) {
+ pathToEvals.put(entry.getKey(), toFilter(getWorkForAliases(entry.getValue())));
+ }
+ return pathToEvals;
+ }
+
+  private List<Operator<? extends OperatorDesc>> getWorkForAliases(List<String> aliases) {
+    List<Operator<? extends OperatorDesc>> operators = new ArrayList<Operator<? extends OperatorDesc>>();
+    for (String alias : aliases) {
+      Operator<? extends OperatorDesc> work = aliasToWork.get(alias);
+ if (work == null) {
+ throw new IllegalStateException("Invalid alias " + alias);
+ }
+ operators.add(work);
+ }
+ return operators;
+ }
+
+ private static class InputSummary {
+
+ private boolean exists = false;
+
+ private long length;
+ private long fileCount;
+ private long directoryCount;
+    private List<Path> paths;
+
+    public InputSummary() {
+      paths = new ArrayList<Path>();
+ }
+
+ public InputSummary(Path path) {
+ paths = Arrays.asList(path);
+ directoryCount = 1;
+ }
+
+ public InputSummary(Path path, ContentSummary content) {
+ this.paths = Arrays.asList(path);
+ this.length = content.getLength();
+ this.fileCount = content.getFileCount();
+ this.directoryCount = content.getDirectoryCount();
+ this.exists = true;
+ }
+
+ public void add(Path path, FileStatus status) {
+ paths.add(path);
+ length += status.getLen();
+ if (!status.isDir()) {
+ fileCount++;
+ } else {
+ directoryCount++;
+ }
+ exists = true;
+ }
+
+ public ContentSummary toContentSummary() {
+ return new ContentSummary(length, fileCount, directoryCount);
+ }
+ }
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
index f3203bf..800ddca 100644
--- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java
@@ -18,10 +18,15 @@
package org.apache.hadoop.hive.ql.plan;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.mapred.JobConf;
@@ -80,4 +85,9 @@ public void configureJobConf(JobConf job) {
return ops;
}
+
+ public ContentSummary getTotalSummary(Context ctx, Configuration conf)
+ throws IOException, HiveException {
+ return mapWork.getTotalSummary(ctx, conf);
+ }
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
index 699b476..e7e504d 100644
--- ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
@@ -67,6 +67,7 @@
private int maxStatsKeyPrefixLength = -1;
private ExprNodeGenericFuncDesc filterExpr;
+ private ExprNodeGenericFuncDesc fileFilterExpr;
private transient Serializable filterObject;
// Both neededColumnIDs and neededColumns should never be null.
@@ -164,6 +165,15 @@ public void setReferencedColumns(List referencedColumns) {
return referencedColumns;
}
+ @Explain(displayName = "fileFilterExpr")
+ public ExprNodeGenericFuncDesc getFileFilterExpr() {
+ return fileFilterExpr;
+ }
+
+ public void setFileFilterExpr(ExprNodeGenericFuncDesc fileFilterExpr) {
+ this.fileFilterExpr = fileFilterExpr;
+ }
+
public void setAlias(String alias) {
this.alias = alias;
}
diff --git ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java
index 7aaf455..968732b 100644
--- ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java
@@ -44,6 +44,7 @@
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.FilePruningPredicateHandler;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler;
import org.apache.hadoop.hive.ql.metadata.Table;
@@ -780,7 +781,7 @@ protected static Object createFilter(Operator op,
* by Hive as a post-filter, or null if it was possible
* to push down the entire predicate
*/
- private static ExprNodeGenericFuncDesc pushFilterToStorageHandler(
+ private static ExprNodeDesc pushFilterToStorageHandler(
TableScanOperator tableScanOp,
ExprNodeGenericFuncDesc originalPredicate,
OpWalkerInfo owi,
@@ -794,11 +795,19 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler(
// optimizations are applied
tableScanDesc.setFilterExpr(originalPredicate);
}
- if (!tbl.isNonNative()) {
+
+ HiveStoragePredicateHandler predicateHandler = null;
+ if (tbl.isNonNative()) {
+ HiveStorageHandler storageHandler = tbl.getStorageHandler();
+ if (storageHandler instanceof HiveStoragePredicateHandler) {
+ predicateHandler = (HiveStoragePredicateHandler) storageHandler;
+ }
+ } else if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEPPDFILES)) {
+ predicateHandler = new FilePruningPredicateHandler();
+ } else {
return originalPredicate;
}
- HiveStorageHandler storageHandler = tbl.getStorageHandler();
- if (!(storageHandler instanceof HiveStoragePredicateHandler)) {
+ if (predicateHandler == null) {
// The storage handler does not provide predicate decomposition
// support, so we'll implement the entire filter in Hive. However,
// we still provide the full predicate to the storage handler in
@@ -806,8 +815,7 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler(
tableScanDesc.setFilterExpr(originalPredicate);
return originalPredicate;
}
- HiveStoragePredicateHandler predicateHandler =
- (HiveStoragePredicateHandler) storageHandler;
+
JobConf jobConf = new JobConf(owi.getParseContext().getConf());
Utilities.setColumnNameList(jobConf, tableScanOp);
Utilities.setColumnTypeList(jobConf, tableScanOp);
@@ -842,10 +850,17 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler(
+ decomposed.residualPredicate.getExprString());
}
}
- tableScanDesc.setFilterExpr(decomposed.pushedPredicate);
- tableScanDesc.setFilterObject(decomposed.pushedPredicateObject);
- return (ExprNodeGenericFuncDesc)decomposed.residualPredicate;
+ if (decomposed.pushedPredicate != null) {
+ ExprNodeGenericFuncDesc predicate =
+ ExprNodeDescUtils.toPredicate(decomposed.pushedPredicate);
+ if (predicateHandler instanceof FilePruningPredicateHandler) {
+ tableScanDesc.setFileFilterExpr(predicate);
+ } else {
+ tableScanDesc.setFilterExpr(predicate);
+ }
+ }
+ return decomposed.residualPredicate;
}
public static NodeProcessor getFilterProc() {
diff --git ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java
index 41243fe..d23146f 100644
--- ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java
+++ ql/src/test/org/apache/hadoop/hive/ql/io/TestSymlinkTextInputFormat.java
@@ -37,7 +37,6 @@
import org.apache.hadoop.hive.ql.Driver;
import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.exec.Utilities;
-import org.apache.hadoop.hive.ql.exec.mr.ExecDriver;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
@@ -177,7 +176,7 @@ public void testCombine() throws Exception {
QueryPlan plan = drv.getPlan();
MapRedTask selectTask = (MapRedTask)plan.getRootTasks().get(0);
-    List<Path> inputPaths = Utilities.getInputPaths(newJob, selectTask.getWork().getMapWork(), emptyScratchDir, ctx);
+    List<Path> inputPaths = Utilities.getInputPaths(newJob, selectTask.getWork().getMapWork(), emptyScratchDir);
Utilities.setInputPaths(newJob, inputPaths);
Utilities.setMapRedWork(newJob, selectTask.getWork(), ctx.getMRTmpPath());
diff --git ql/src/test/queries/clientpositive/file_pruning.q ql/src/test/queries/clientpositive/file_pruning.q
new file mode 100644
index 0000000..8231c0d
--- /dev/null
+++ ql/src/test/queries/clientpositive/file_pruning.q
@@ -0,0 +1,17 @@
+-- SORT_QUERY_RESULTS
+
+create temporary function pathname as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName';
+
+set hive.optimize.ppd.vc.filename=true;
+set hive.auto.convert.join=false;
+
+-- HIVE-1662 File Pruning by filter on INPUT__FILE__NAME (VC)
+-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23)
+-- and selects srcbucket20 and srcbucket23 as input
+
+explain extended
+select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A
+ where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value;
+
+select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A
+ where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value;
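
The pruning this test expects hinges on Hive's RLIKE semantics: RLIKE is true when any substring of the value matches the Java regular expression, so 'srcbucket2[03].txt' keeps srcbucket20.txt and srcbucket23.txt and drops the other two bucket files. A quick standalone check of that assumption (the class name below is illustrative only):

    import java.util.regex.Pattern;

    public class SrcbucketPatternCheck {
      public static void main(String[] args) {
        // Hive's RLIKE uses substring-match (find) semantics on Java regexes.
        Pattern p = Pattern.compile("srcbucket2[03].txt");
        String[] files = {
            "srcbucket20.txt", "srcbucket21.txt", "srcbucket22.txt", "srcbucket23.txt"
        };
        for (String f : files) {
          System.out.println(f + " -> " + p.matcher(f).find());
        }
        // Only srcbucket20.txt and srcbucket23.txt print true, so only those
        // two files need to be scanned once the file filter is pushed down.
      }
    }
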
diff --git ql/src/test/results/clientpositive/file_pruning.q.out ql/src/test/results/clientpositive/file_pruning.q.out
new file mode 100644
index 0000000..0663456
--- /dev/null
+++ ql/src/test/results/clientpositive/file_pruning.q.out
@@ -0,0 +1,257 @@
+PREHOOK: query: -- SORT_QUERY_RESULTS
+
+create temporary function pathname as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName'
+PREHOOK: type: CREATEFUNCTION
+PREHOOK: Output: database:default
+POSTHOOK: query: -- SORT_QUERY_RESULTS
+
+create temporary function pathname as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName'
+POSTHOOK: type: CREATEFUNCTION
+POSTHOOK: Output: database:default
+PREHOOK: query: -- HIVE-1662 File Pruning by filter on INPUT__FILE__NAME (VC)
+-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23)
+-- and selects srcbucket20 and srcbucket23 as input
+
+explain extended
+select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A
+ where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value
+PREHOOK: type: QUERY
+POSTHOOK: query: -- HIVE-1662 File Pruning by filter on INPUT__FILE__NAME (VC)
+-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23)
+-- and selects srcbucket20 and srcbucket23 as input
+
+explain extended
+select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A
+ where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+
+TOK_QUERY
+ TOK_FROM
+ TOK_SUBQUERY
+ TOK_QUERY
+ TOK_FROM
+ TOK_TABREF
+ TOK_TABNAME
+ srcbucket2
+ TOK_INSERT
+ TOK_DESTINATION
+ TOK_DIR
+ TOK_TMP_FILE
+ TOK_SELECT
+ TOK_SELEXPR
+ TOK_TABLE_OR_COL
+ key
+ TOK_SELEXPR
+ TOK_TABLE_OR_COL
+ value
+ TOK_SELEXPR
+ TOK_FUNCTION
+ pathname
+ TOK_TABLE_OR_COL
+ INPUT__FILE__NAME
+ filename
+ A
+ TOK_INSERT
+ TOK_DESTINATION
+ TOK_DIR
+ TOK_TMP_FILE
+ TOK_SELECT
+ TOK_SELEXPR
+ TOK_ALLCOLREF
+ TOK_WHERE
+ AND
+ rlike
+ TOK_TABLE_OR_COL
+ filename
+ 'srcbucket2[03].txt'
+ <
+ TOK_TABLE_OR_COL
+ key
+ 100
+ TOK_ORDERBY
+ TOK_TABSORTCOLNAMEASC
+ TOK_TABLE_OR_COL
+ filename
+ TOK_TABSORTCOLNAMEASC
+ TOK_TABLE_OR_COL
+ key
+ TOK_TABSORTCOLNAMEASC
+ TOK_TABLE_OR_COL
+ value
+
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: srcbucket2
+ Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+ GatherStats: false
+ Filter Operator
+ isSamplingPred: false
+ predicate: (key < 100) (type: boolean)
+ Statistics: Num rows: 18 Data size: 1902 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: key (type: int), value (type: string), pathname(INPUT__FILE__NAME) (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 18 Data size: 1902 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ isSamplingPred: false
+ predicate: (_col2 rlike 'srcbucket2[03].txt') (type: boolean)
+ Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col2 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col2 (type: string), _col0 (type: int), _col1 (type: string)
+ sort order: +++
+ Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: srcbucket2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.comments
+ columns.types int:string
+#### A masked pattern was here ####
+ name default.srcbucket2
+ numFiles 4
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct srcbucket2 { i32 key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.comments
+ columns.types int:string
+#### A masked pattern was here ####
+ name default.srcbucket2
+ numFiles 4
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct srcbucket2 { i32 key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.srcbucket2
+ name: default.srcbucket2
+ Truncated Path -> Alias:
+ /srcbucket2 [a:srcbucket2]
+ Needs Tagging: false
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey1 (type: int), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey0 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types int:string:string
+ escape.delim \
+ hive.serialization.extend.nesting.levels true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A
+ where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcbucket2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A
+ where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcbucket2
+#### A masked pattern was here ####
+0 val_0 srcbucket20.txt
+0 val_0 srcbucket20.txt
+0 val_0 srcbucket20.txt
+10 val_10 srcbucket23.txt
+11 val_11 srcbucket20.txt
+15 val_15 srcbucket20.txt
+15 val_15 srcbucket20.txt
+18 val_18 srcbucket23.txt
+18 val_18 srcbucket23.txt
+19 val_19 srcbucket20.txt
+26 val_26 srcbucket20.txt
+26 val_26 srcbucket20.txt
+33 val_33 srcbucket20.txt
+37 val_37 srcbucket20.txt
+37 val_37 srcbucket20.txt
+4 val_4 srcbucket20.txt
+43 val_43 srcbucket23.txt
+44 val_44 srcbucket20.txt
+47 val_47 srcbucket23.txt
+51 val_51 srcbucket20.txt
+51 val_51 srcbucket20.txt
+54 val_54 srcbucket23.txt
+58 val_58 srcbucket23.txt
+58 val_58 srcbucket23.txt
+65 val_65 srcbucket23.txt
+66 val_66 srcbucket20.txt
+69 val_69 srcbucket23.txt
+72 val_72 srcbucket23.txt
+72 val_72 srcbucket23.txt
+76 val_76 srcbucket23.txt
+76 val_76 srcbucket23.txt
+77 val_77 srcbucket20.txt
+8 val_8 srcbucket20.txt
+80 val_80 srcbucket20.txt
+83 val_83 srcbucket23.txt
+83 val_83 srcbucket23.txt
+84 val_84 srcbucket20.txt
+84 val_84 srcbucket20.txt
+87 val_87 srcbucket23.txt
+90 val_90 srcbucket23.txt
+90 val_90 srcbucket23.txt
+90 val_90 srcbucket23.txt
+95 val_95 srcbucket20.txt
+95 val_95 srcbucket20.txt
+98 val_98 srcbucket23.txt
+98 val_98 srcbucket23.txt
diff --git ql/src/test/results/clientpositive/tez/file_pruning.q.out ql/src/test/results/clientpositive/tez/file_pruning.q.out
new file mode 100644
index 0000000..b352c0a
--- /dev/null
+++ ql/src/test/results/clientpositive/tez/file_pruning.q.out
@@ -0,0 +1,263 @@
+PREHOOK: query: -- SORT_QUERY_RESULTS
+
+create temporary function pathname as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName'
+PREHOOK: type: CREATEFUNCTION
+PREHOOK: Output: database:default
+POSTHOOK: query: -- SORT_QUERY_RESULTS
+
+create temporary function pathname as 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFPathName'
+POSTHOOK: type: CREATEFUNCTION
+POSTHOOK: Output: database:default
+PREHOOK: query: -- HIVE-1662 File Pruning by filter on INPUT__FILE__NAME (VC)
+-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23)
+-- and selects srcbucket20 and srcbucket23 as input
+
+explain extended
+select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A
+ where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value
+PREHOOK: type: QUERY
+POSTHOOK: query: -- HIVE-1662 File Pruning by filter on INPUT__FILE__NAME (VC)
+-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23)
+-- and selects srcbucket20 and srcbucket23 as input
+
+explain extended
+select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A
+ where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+
+TOK_QUERY
+ TOK_FROM
+ TOK_SUBQUERY
+ TOK_QUERY
+ TOK_FROM
+ TOK_TABREF
+ TOK_TABNAME
+ srcbucket2
+ TOK_INSERT
+ TOK_DESTINATION
+ TOK_DIR
+ TOK_TMP_FILE
+ TOK_SELECT
+ TOK_SELEXPR
+ TOK_TABLE_OR_COL
+ key
+ TOK_SELEXPR
+ TOK_TABLE_OR_COL
+ value
+ TOK_SELEXPR
+ TOK_FUNCTION
+ pathname
+ TOK_TABLE_OR_COL
+ INPUT__FILE__NAME
+ filename
+ A
+ TOK_INSERT
+ TOK_DESTINATION
+ TOK_DIR
+ TOK_TMP_FILE
+ TOK_SELECT
+ TOK_SELEXPR
+ TOK_ALLCOLREF
+ TOK_WHERE
+ AND
+ rlike
+ TOK_TABLE_OR_COL
+ filename
+ 'srcbucket2[03].txt'
+ <
+ TOK_TABLE_OR_COL
+ key
+ 100
+ TOK_ORDERBY
+ TOK_TABSORTCOLNAMEASC
+ TOK_TABLE_OR_COL
+ filename
+ TOK_TABSORTCOLNAMEASC
+ TOK_TABLE_OR_COL
+ key
+ TOK_TABSORTCOLNAMEASC
+ TOK_TABLE_OR_COL
+ value
+
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: srcbucket2
+ Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE
+ GatherStats: false
+ Filter Operator
+ isSamplingPred: false
+ predicate: (key < 100) (type: boolean)
+ Statistics: Num rows: 18 Data size: 1902 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: key (type: int), value (type: string), pathname(INPUT__FILE__NAME) (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 18 Data size: 1902 Basic stats: COMPLETE Column stats: NONE
+ Filter Operator
+ isSamplingPred: false
+ predicate: (_col2 rlike 'srcbucket2[03].txt') (type: boolean)
+ Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string), _col2 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE
+ Reduce Output Operator
+ key expressions: _col2 (type: string), _col0 (type: int), _col1 (type: string)
+ sort order: +++
+ Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE
+ tag: -1
+ Path -> Alias:
+#### A masked pattern was here ####
+ Path -> Partition:
+#### A masked pattern was here ####
+ Partition
+ base file name: srcbucket2
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.comments
+ columns.types int:string
+#### A masked pattern was here ####
+ name default.srcbucket2
+ numFiles 4
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct srcbucket2 { i32 key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ COLUMN_STATS_ACCURATE true
+ bucket_count 4
+ bucket_field_name key
+ columns key,value
+ columns.comments
+ columns.types int:string
+#### A masked pattern was here ####
+ name default.srcbucket2
+ numFiles 4
+ numRows 0
+ rawDataSize 0
+ serialization.ddl struct srcbucket2 { i32 key, string value}
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ totalSize 5812
+#### A masked pattern was here ####
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.srcbucket2
+ name: default.srcbucket2
+ Truncated Path -> Alias:
+ /srcbucket2 [srcbucket2]
+ Reducer 2
+ Needs Tagging: false
+ Reduce Operator Tree:
+ Select Operator
+ expressions: KEY.reducesinkkey1 (type: int), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey0 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+#### A masked pattern was here ####
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE
+#### A masked pattern was here ####
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ columns _col0,_col1,_col2
+ columns.types int:string:string
+ escape.delim \
+ hive.serialization.extend.nesting.levels true
+ serialization.format 1
+ serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A
+ where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcbucket2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from (select key, value, pathname(INPUT__FILE__NAME) as filename from srcbucket2) A
+ where filename rlike 'srcbucket2[03].txt' AND key < 100 order by filename, key, value
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcbucket2
+#### A masked pattern was here ####
+0 val_0 srcbucket20.txt
+0 val_0 srcbucket20.txt
+0 val_0 srcbucket20.txt
+10 val_10 srcbucket23.txt
+11 val_11 srcbucket20.txt
+15 val_15 srcbucket20.txt
+15 val_15 srcbucket20.txt
+18 val_18 srcbucket23.txt
+18 val_18 srcbucket23.txt
+19 val_19 srcbucket20.txt
+26 val_26 srcbucket20.txt
+26 val_26 srcbucket20.txt
+33 val_33 srcbucket20.txt
+37 val_37 srcbucket20.txt
+37 val_37 srcbucket20.txt
+4 val_4 srcbucket20.txt
+43 val_43 srcbucket23.txt
+44 val_44 srcbucket20.txt
+47 val_47 srcbucket23.txt
+51 val_51 srcbucket20.txt
+51 val_51 srcbucket20.txt
+54 val_54 srcbucket23.txt
+58 val_58 srcbucket23.txt
+58 val_58 srcbucket23.txt
+65 val_65 srcbucket23.txt
+66 val_66 srcbucket20.txt
+69 val_69 srcbucket23.txt
+72 val_72 srcbucket23.txt
+72 val_72 srcbucket23.txt
+76 val_76 srcbucket23.txt
+76 val_76 srcbucket23.txt
+77 val_77 srcbucket20.txt
+8 val_8 srcbucket20.txt
+80 val_80 srcbucket20.txt
+83 val_83 srcbucket23.txt
+83 val_83 srcbucket23.txt
+84 val_84 srcbucket20.txt
+84 val_84 srcbucket20.txt
+87 val_87 srcbucket23.txt
+90 val_90 srcbucket23.txt
+90 val_90 srcbucket23.txt
+90 val_90 srcbucket23.txt
+95 val_95 srcbucket20.txt
+95 val_95 srcbucket20.txt
+98 val_98 srcbucket23.txt
+98 val_98 srcbucket23.txt