diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 3bfd539..b3fc88e 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -575,6 +575,7 @@ HIVEOPTPPD("hive.optimize.ppd", true), // predicate pushdown HIVEPPDRECOGNIZETRANSITIVITY("hive.ppd.recognizetransivity", true), // predicate pushdown HIVEPPDREMOVEDUPLICATEFILTERS("hive.ppd.remove.duplicatefilters", true), + HIVEPPDFILES("hive.optimize.ppd.vc.filename", false), HIVEMETADATAONLYQUERIES("hive.optimize.metadataonly", true), // push predicates down to storage handlers HIVEOPTPPD_STORAGE("hive.optimize.ppd.storage", true), diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index 96a78fc..b4a41a4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -1335,14 +1335,25 @@ public static Method getMethodInternal(Class udfClass, List mlist, bo * out of array and getting values out of map. */ public static GenericUDF getGenericUDFForIndex() { - return FunctionRegistry.getFunctionInfo("index").getGenericUDF(); + return getGenericUDF("index"); } /** * A shortcut to get the "and" GenericUDF. */ public static GenericUDF getGenericUDFForAnd() { - return FunctionRegistry.getFunctionInfo("and").getGenericUDF(); + return getGenericUDF("and"); + } + + /** + * A shortcut to get the "or" GenericUDF. + */ + public static GenericUDF getGenericUDFForOr() { + return getGenericUDF("or"); + } + + public static GenericUDF getGenericUDF(String name) { + return FunctionRegistry.getFunctionInfo(name).getGenericUDF(); } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 7dc3d59..47d0a8a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -63,6 +63,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -96,7 +97,6 @@ import org.apache.hadoop.hive.common.HiveInterruptCallback; import org.apache.hadoop.hive.common.HiveInterruptUtils; import org.apache.hadoop.hive.common.HiveStatsUtils; -import org.apache.hadoop.hive.common.ObjectPair; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.Warehouse; import org.apache.hadoop.hive.metastore.api.FieldSchema; @@ -2864,15 +2864,20 @@ public static double getHighestSamplePercentage (MapWork work) { int sequenceNumber = 0; Set pathsProcessed = new HashSet(); - List pathsToAdd = new LinkedList(); + Set pathsToAdd = new LinkedHashSet(); + // AliasToWork contains all the aliases - for (String alias : work.getAliasToWork().keySet()) { + Map> aliasToWork = work.getAliasToWork(); + Map> pathToAliases = work.getPathToAliases(); + + for (String alias : aliasToWork.keySet()) { LOG.info("Processing alias " + alias); // The alias may not have any path Path path = null; - for (String file : new LinkedList(work.getPathToAliases().keySet())) { - List aliases = work.getPathToAliases().get(file); + for (Map.Entry> entry : pathToAliases.entrySet()) { + String file = entry.getKey(); + List aliases = entry.getValue(); if (aliases.contains(alias)) { path = new Path(file); 
@@ -2885,12 +2890,16 @@ public static double getHighestSamplePercentage (MapWork work) { pathsProcessed.add(path); LOG.info("Adding input file " + path); + + List adding; if (isEmptyPath(job, path, ctx)) { - path = createDummyFileForEmptyPartition(path, job, work, - hiveScratchDir, alias, sequenceNumber++); + adding = Arrays.asList(createDummyFileForEmptyPartition(path, job, work, + hiveScratchDir, alias, sequenceNumber++)); + } else { + adding = work.getPathsFor(path, ctx.getConf()); } - pathsToAdd.add(path); + pathsToAdd.addAll(adding); } } @@ -2908,7 +2917,7 @@ public static double getHighestSamplePercentage (MapWork work) { pathsToAdd.add(path); } } - return pathsToAdd; + return new ArrayList(pathsToAdd); } @SuppressWarnings({"rawtypes", "unchecked"}) diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java index 42d764d..7b423a5 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java @@ -36,6 +36,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.ContentSummary; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -103,11 +104,13 @@ private static final long serialVersionUID = 1L; private static final String JOBCONF_FILENAME = "jobconf.xml"; - protected transient JobConf job; - public static MemoryMXBean memoryMXBean; + protected static transient final Log LOG = LogFactory.getLog(ExecDriver.class); + protected static MemoryMXBean memoryMXBean; + protected HadoopJobExecHelper jobExecHelper; - protected static transient final Log LOG = LogFactory.getLog(ExecDriver.class); + protected transient JobConf job; + protected transient ContentSummary inputSummary; private RunningJob rj; @@ -120,6 +123,13 @@ public ExecDriver() { this.jobExecHelper = new HadoopJobExecHelper(job, console, this, this); } + public ContentSummary getInputSummary(Context ctx) throws IOException, HiveException { + if (inputSummary == null) { + inputSummary = getWork().getTotalSummary(ctx, job); + } + return inputSummary; + } + @Override public boolean requireLock() { return true; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java index a7e2253..0d7bd3d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java @@ -39,6 +39,7 @@ import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.Utilities.StreamPrinter; +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.ReduceWork; @@ -63,7 +64,6 @@ static final String HIVE_CHILD_CLIENT_DEBUG_OPTS = "HIVE_CHILD_CLIENT_DEBUG_OPTS"; static final String[] HIVE_SYS_PROP = {"build.dir", "build.dir.hive", "hive.query.id"}; - private transient ContentSummary inputSummary = null; private transient boolean runningViaChild = false; private transient long totalInputFileSize; @@ -88,15 +88,13 @@ public int execute(DriverContext driverContext) { } // estimate number of reducers - setNumberOfReducers(); + setNumberOfReducers(ctx); // auto-determine local mode if 
allowed if (!ctx.isLocalOnlyExecutionMode() && conf.getBoolVar(HiveConf.ConfVars.LOCALMODEAUTO)) { - if (inputSummary == null) { - inputSummary = Utilities.getInputSummary(driverContext.getCtx(), work.getMapWork(), null); - } + ContentSummary inputSummary = getInputSummary(ctx); // set the values of totalInputFileSize and totalInputNumFiles, estimating them // if percentage block sampling is being used @@ -373,7 +371,7 @@ public boolean reduceDone() { /** * Set the number of reducers for the mapred work. */ - private void setNumberOfReducers() throws IOException { + private void setNumberOfReducers(Context ctx) throws IOException, HiveException { ReduceWork rWork = work.getReduceWork(); // this is a temporary hack to fix things that are not fixed in the compiler Integer numReducersFromWork = rWork == null ? 0 : rWork.getNumReduceTasks(); @@ -392,9 +390,7 @@ private void setNumberOfReducers() throws IOException { .printInfo("Number of reduce tasks not specified. Defaulting to jobconf value of: " + reducers); } else { - if (inputSummary == null) { - inputSummary = Utilities.getInputSummary(driverContext.getCtx(), work.getMapWork(), null); - } + ContentSummary inputSummary = getInputSummary(ctx); int reducers = Utilities.estimateNumberOfReducers(conf, inputSummary, work.getMapWork(), work.isFinalMapRed()); rWork.setNumReduceTasks(reducers); diff --git ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java index e66c22c..e7d5756 100644 --- ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java @@ -66,10 +66,21 @@ private Set allowedColumnNames; + private boolean allowAllColumns; + private boolean allowAllFunctions; + public IndexPredicateAnalyzer() { udfNames = new HashSet(); } + public void setAllowAllColumns(boolean allowAllColumns) { + this.allowAllColumns = allowAllColumns; + } + + public void setAllowAllFunctions(boolean allowAllFunctions) { + this.allowAllFunctions = allowAllFunctions; + } + /** * Registers a comparison operator as one which can be satisfied * by an index search. Unless this is called, analyzePredicate @@ -159,7 +170,6 @@ private ExprNodeDesc analyzeExpr( List searchConditions, Object... 
nodeOutputs) { - expr = (ExprNodeGenericFuncDesc) expr; if (FunctionRegistry.isOpAnd(expr)) { assert(nodeOutputs.length == 2); ExprNodeDesc residual1 = (ExprNodeDesc) nodeOutputs[0]; @@ -186,7 +196,7 @@ private ExprNodeDesc analyzeExpr( } else { udfName = expr.getGenericUDF().getClass().getName(); } - if (!udfNames.contains(udfName)) { + if (!allowAllFunctions && !udfNames.contains(udfName)) { return expr; } @@ -209,7 +219,7 @@ private ExprNodeDesc analyzeExpr( return expr; } if (allowedColumnNames != null) { - if (!allowedColumnNames.contains(columnDesc.getColumn())) { + if (!allowAllColumns && !allowedColumnNames.contains(columnDesc.getColumn())) { return expr; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java index 4be56f3..93dee9d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.ql.io; import java.io.IOException; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -34,7 +33,6 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.io.HivePassThroughOutputFormat; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; @@ -189,7 +187,7 @@ public static synchronized void registerInputFormatChecker( * get an InputFormatChecker for a file format. */ public static synchronized Class getInputFormatChecker( - Class inputFormat) { + Class inputFormat) { Class result = inputFormatCheckerMap .get(inputFormat); return result; @@ -428,7 +426,6 @@ private static String getMatchingPath(Map> pathToAlias if (foundAlias(pathToAliases, dirPath)) { return dirPath; } - path = dirPath; String dirStr = dir.toString(); int dirPathIndex = dirPath.lastIndexOf(Path.SEPARATOR); @@ -449,6 +446,19 @@ private static String getMatchingPath(Map> pathToAlias return null; } + public static String getMatchingPath(Map> pathToAliases, String path) { + while (true) { + if (foundAlias(pathToAliases, path)) { + return path; + } + int index = path.lastIndexOf(Path.SEPARATOR); + if (index < 0) { + return null; + } + path = path.substring(0, index); + } + } + /** * Get the list of operators from the operator tree that are needed for the path * @param pathToAliases mapping from path to aliases diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java index 99172d4..dcf4387 100755 --- ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java @@ -312,7 +312,8 @@ private void addSplitsForGroup(List dirs, TableScanOperator tableScan, Job // for each dir, get the InputFormat, and do getSplits. for (Path dir : dirs) { - PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir); + PartitionDesc part = HiveFileFormatUtils. 
+ getPartitionDescFromPathRecursively(pathToPartitionInfo, dir, null); Class inputFormatClass = part.getInputFileFormatClass(); TableDesc table = part.getTableDesc(); TableScanOperator tableScan = null; diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePrunningPredicateHandler.java ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePrunningPredicateHandler.java new file mode 100644 index 0000000..4420856 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePrunningPredicateHandler.java @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.metadata; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer; +import org.apache.hadoop.hive.ql.index.IndexSearchCondition; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.mapred.JobConf; + +/** + * Extracts the file pruning expression (used by MapredWork) + */ +public class FilePrunningPredicateHandler implements HiveStoragePredicateHandler { + + @Override + public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer deserializer, + ExprNodeDesc predicate) { + IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer(); + analyzer.allowColumnName(VirtualColumn.FILENAME.getName()); + analyzer.setAllowAllFunctions(true); + + List searchConditions = new ArrayList(); + ExprNodeDesc residual = analyzer.analyzePredicate(predicate, searchConditions); + + // there is no guarantee that the pushed predicate is always applied, so keep the entire predicate as the residual.
+ DecomposedPredicate decomposedPredicate = new DecomposedPredicate(); + decomposedPredicate.pushedPredicate = analyzer.translateSearchConditions(searchConditions); + + // keep all (todo: can be removed when it's referenced by one alias) + decomposedPredicate.residualPredicate = predicate; + return decomposedPredicate; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java index 9f35575..df56aa9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java @@ -18,10 +18,9 @@ package org.apache.hadoop.hive.ql.metadata; +import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.serde2.Deserializer; -import org.apache.hadoop.mapred.JobConf; /** * HiveStoragePredicateHandler is an optional companion to {@link @@ -66,12 +65,12 @@ public DecomposedPredicate decomposePredicate( * Portion of predicate to be evaluated by storage handler. Hive * will pass this into the storage handler's input format. */ - public ExprNodeGenericFuncDesc pushedPredicate; + public ExprNodeDesc pushedPredicate; /** * Portion of predicate to be post-evaluated by Hive for any rows * which are returned by storage handler. */ - public ExprNodeGenericFuncDesc residualPredicate; + public ExprNodeDesc residualPredicate; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java index 33ef581..177ff2d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java @@ -124,10 +124,6 @@ public long getTotalKnownInputSize(Context context, MapWork currWork, Map> pathToAliases, HashMap aliasToSize) throws SemanticException { try { - // go over all the input paths, and calculate a known total size, known - // size for each input alias. - Utilities.getInputSummary(context, currWork, null).getLength(); - // set alias to size mapping, this can be used to determine if one table // is chosen as big table, what's the total size of left tables, which // are going to be small tables. 
@@ -135,7 +131,7 @@ public long getTotalKnownInputSize(Context context, MapWork currWork, for (Map.Entry> entry : pathToAliases.entrySet()) { String path = entry.getKey(); List aliasList = entry.getValue(); - ContentSummary cs = context.getCS(path); + ContentSummary cs = currWork.getSummaryFor(path, context.getConf()); if (cs != null) { long size = cs.getLength(); for (String alias : aliasList) { diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java index 5c6751c..a665345 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer.physical.index; -import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashSet; @@ -33,7 +32,6 @@ import org.apache.hadoop.hive.metastore.api.Index; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; -import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.hooks.ReadEntity; import org.apache.hadoop.hive.ql.index.HiveIndexHandler; @@ -204,14 +202,15 @@ private void rewriteForIndexes(ExprNodeDesc predicate, List indexes, } // check the size + MapredWork work = task.getWork(); try { - ContentSummary inputSummary = Utilities.getInputSummary(pctx.getContext(), task.getWork().getMapWork(), null); + ContentSummary inputSummary = work.getTotalSummary(pctx.getContext(), pctx.getConf()); long inputSize = inputSummary.getLength(); if (!indexHandler.checkQuerySize(inputSize, pctx.getConf())) { queryContext.setQueryTasks(null); return; } - } catch (IOException e) { + } catch (Exception e) { throw new SemanticException("Failed to get task size", e); } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java index 76f5a31..bd59711 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.parse; -import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; @@ -222,9 +221,9 @@ public boolean accept(Path file) { boolean hasNonLocalJob = false; for (ExecDriver mrtask : mrtasks) { try { - ContentSummary inputSummary = Utilities.getInputSummary - (ctx, ((MapredWork) mrtask.getWork()).getMapWork(), p); - int numReducers = getNumberOfReducers(mrtask.getWork(), conf); + MapredWork work = mrtask.getWork(); + ContentSummary inputSummary = work.getTotalSummary(lCtx, ctx.getConf()); + int numReducers = getNumberOfReducers(work, conf); long estimatedInput; @@ -257,7 +256,7 @@ public boolean accept(Path file) { } else { mrtask.setLocalMode(true); } - } catch (IOException e) { + } catch (Exception e) { throw new SemanticException(e); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java index 96c8d89..4d754ba 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java @@ -19,12 +19,14 @@ package org.apache.hadoop.hive.ql.plan; import 
java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; public class ExprNodeDescUtils { @@ -91,10 +93,30 @@ public static boolean containsPredicate(ExprNodeDesc source, ExprNodeDesc predic return false; } + public static ExprNodeGenericFuncDesc toPredicate(ExprNodeDesc predicate) { + if (predicate instanceof ExprNodeGenericFuncDesc) { + return (ExprNodeGenericFuncDesc) predicate; + } + return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + FunctionRegistry.getGenericUDF(serdeConstants.BOOLEAN_TYPE_NAME), + new ArrayList(Arrays.asList(predicate))); + } + + /** + * bind two predicates by OR op + */ + public static ExprNodeDesc orPredicates(ExprNodeDesc prev, ExprNodeDesc next) { + List children = new ArrayList(2); + children.add(prev); + children.add(next); + return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + FunctionRegistry.getGenericUDFForOr(), children); + } + /** * bind two predicates by AND op */ - public static ExprNodeGenericFuncDesc mergePredicates(ExprNodeDesc prev, ExprNodeDesc next) { + public static ExprNodeDesc andPredicates(ExprNodeDesc prev, ExprNodeDesc next) { List children = new ArrayList(2); children.add(prev); children.add(next); @@ -112,7 +134,7 @@ public static ExprNodeDesc mergePredicates(List exprs) { prev = expr; continue; } - prev = mergePredicates(prev, expr); + prev = andPredicates(prev, expr); } return prev; } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java index 9929275..6a76009 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.plan; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -30,16 +31,34 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory; import org.apache.hadoop.hive.ql.exec.FileSinkOperator; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.OperatorUtils; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat; +import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; +import org.apache.hadoop.hive.ql.io.HiveInputFormat; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol; import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.SortCol; import org.apache.hadoop.hive.ql.parse.OpParseContext; import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.SplitSample; +import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.JobConf; /** @@ -513,4 +532,198 @@ public void setVectorMode(boolean vectorMode) { this.vectorMode = vectorMode; } + private static final ObjectInspector VC_FILE_OI = + ObjectInspectorFactory.getStandardStructObjectInspector( + Arrays.asList(VirtualColumn.FILENAME.getName()), + Arrays.asList((ObjectInspector) PrimitiveObjectInspectorFactory.javaStringObjectInspector)); + + private static final BooleanObjectInspector EVAL_OI = + PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; + + private transient Map pathToEvals; + private transient Map summaries; + + public ContentSummary getSummaryFor(String path, Configuration conf) + throws IOException, HiveException { + return getInputSummaryFor(path, conf).toContentSummary(); + } + + // if file pruning is applied, return file paths passed the filter. if not, return input path + public List getPathsFor(String path, Configuration conf) + throws IOException, HiveException { + return getInputSummaryFor(path, conf).paths; + } + + public List getPathsFor(Path path, Configuration conf) + throws IOException, HiveException { + List paths = new ArrayList(); + for (String apath : getPathsFor(path.toString(), conf)) { + paths.add(new Path(apath)); + } + return paths; + } + + private InputSummary getInputSummaryFor(String path, Configuration conf) + throws IOException, HiveException { + path = HiveFileFormatUtils.getMatchingPath(pathToAliases, path); + if (path == null) { + throw new HiveException("Invalid path " + path); + } + InputSummary summary = summaries == null ? 
null : summaries.get(path); + if (summary == null) { + ExprNodeEvaluator evaluator = getPathToEvals().get(path); + if (summaries == null) { + summaries = new HashMap(); + } + summaries.put(path, summary = summarize(path, conf, evaluator)); + } + return summary; + } + + public ContentSummary getTotalSummary(Context ctx, Configuration conf) + throws IOException, HiveException { + long[] summary = new long[3]; + for (String path : pathToAliases.keySet()) { + ContentSummary pathSummary = getSummaryFor(path, conf); + if (pathSummary != null) { + summary[0] += pathSummary.getLength(); + summary[1] += pathSummary.getFileCount(); + summary[2] += pathSummary.getDirectoryCount(); + ctx.addCS(path, pathSummary); + } + } + return new ContentSummary(summary[0], summary[1], summary[2]); + } + + private ExprNodeEvaluator toFilter(List> operators) throws HiveException { + ExprNodeDesc prev = null; + for (Operator operator : operators) { + if (operator instanceof TableScanOperator && operator.getConf() != null) { + ExprNodeDesc filterExpr = ((TableScanOperator) operator).getConf().getFileFilterExpr(); + if (filterExpr == null) { + continue; + } + if (prev == null) { + prev = filterExpr; + } else { + prev = ExprNodeDescUtils.orPredicates(prev, filterExpr); + } + } + } + if (prev == null) { + return null; + } + ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(prev); + evaluator.initialize(VC_FILE_OI); + return evaluator; + } + + private InputSummary summarize(String path, Configuration conf, ExprNodeEvaluator evaluator) + throws IOException, HiveException { + Path apath = new Path(path); + Class format = pathToPartitionInfo.get(path).getInputFileFormatClass(); + if (ContentSummaryInputFormat.class.isAssignableFrom(format)) { + JobConf jobConf = new JobConf(conf); + ContentSummaryInputFormat summaryInput = (ContentSummaryInputFormat) + HiveInputFormat.getInputFormatFromCache(format, jobConf); + ContentSummary content = summaryInput.getContentSummary(apath, jobConf); + return new InputSummary(path, content); + } + if (evaluator == null) { + FileSystem fs = apath.getFileSystem(conf); + if (fs.exists(apath)) { + return new InputSummary(path, fs.getContentSummary(apath)); + } + return new InputSummary(path); + } + InputSummary summary = new InputSummary(); + for (FileStatus inputStatus : getInputStatus(apath, conf)) { + String input = normalize(path, inputStatus.getPath().toString()); + Object evaluated = evaluator.evaluate(new String[]{input}); + if (EVAL_OI.get(evaluated)) { + summary.add(input, inputStatus); + } + } + if (summary.paths.isEmpty()) { + // not existing directory + summary.paths.add(path); + } + return summary; + } + + private String normalize(String path, String inputPath) { + int index = inputPath.lastIndexOf(path); + return path + inputPath.substring(index + path.length()); + } + + private FileStatus[] getInputStatus(Path path, Configuration conf) throws IOException { + FileSystem fs = path.getFileSystem(conf); + if (!fs.exists(path)) { + return new FileStatus[0]; + } + FileStatus status = fs.getFileStatus(path); + if (!status.isDir()) { + return new FileStatus[] {status}; + } + return fs.listStatus(path); + } + + private Map getPathToEvals() throws HiveException { + if (pathToEvals == null) { + pathToEvals = new HashMap(); + } + for (Map.Entry> entry : pathToAliases.entrySet()) { + pathToEvals.put(entry.getKey(), toFilter(getWorkForAliases(entry.getValue()))); + } + return pathToEvals; + } + + private List> getWorkForAliases(List aliases) { + List> operators = new ArrayList>(); + for 
(String alias : aliases) { + Operator work = aliasToWork.get(alias); + if (work == null) { + throw new IllegalStateException("Invalid alias " + alias); + } + operators.add(work); + } + return operators; + } + + private static class InputSummary { + + private long[] summary; + private List paths; + + public InputSummary() { + paths = new ArrayList(); + } + + public InputSummary(String path) { + paths = Arrays.asList(path); + } + + public InputSummary(String path, ContentSummary content) { + this.paths = Arrays.asList(path); + this.summary = new long[] + {content.getLength(), content.getFileCount(), content.getDirectoryCount()}; + } + + public void add(String path, FileStatus status) { + if (summary == null) { + summary = new long[3]; + } + paths.add(path); + summary[0] += status.getLen(); + if (!status.isDir()) { + summary[1]++; + } else { + summary[2]++; + } + } + + public ContentSummary toContentSummary() { + return summary == null ? null : new ContentSummary(summary[0], summary[1], summary[2]); + } + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java index f3203bf..800ddca 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java @@ -18,10 +18,15 @@ package org.apache.hadoop.hive.ql.plan; +import java.io.IOException; import java.util.ArrayList; import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.hive.ql.Context; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.mapred.JobConf; @@ -80,4 +85,9 @@ public void configureJobConf(JobConf job) { return ops; } + + public ContentSummary getTotalSummary(Context ctx, Configuration conf) + throws IOException, HiveException { + return mapWork.getTotalSummary(ctx, conf); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java index da1437c..3136360 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java @@ -63,6 +63,7 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.OutputFormat; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.TextInputFormat; @@ -137,7 +138,8 @@ public static TableDesc getDefaultTableDesc(CreateTableDesc localDirectoryDesc, serdeConstants.SERIALIZATION_LIB, localDirectoryDesc.getSerName()); } if (localDirectoryDesc.getOutputFormat() != null){ - tableDesc.setOutputFileFormatClass(Class.forName(localDirectoryDesc.getOutputFormat())); + tableDesc.setOutputFileFormatClass( + (Class) Class.forName(localDirectoryDesc.getOutputFormat())); } if (localDirectoryDesc.getNullFormat() != null) { tableDesc.getProperties().setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java index 9c35890..9926037 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java @@ -61,6 +61,7 @@ private int maxStatsKeyPrefixLength = -1; private 
ExprNodeGenericFuncDesc filterExpr; + private ExprNodeGenericFuncDesc fileFilterExpr; public static final String FILTER_EXPR_CONF_STR = "hive.io.filter.expr.serialized"; @@ -104,6 +105,15 @@ public void setFilterExpr(ExprNodeGenericFuncDesc filterExpr) { this.filterExpr = filterExpr; } + @Explain(displayName = "fileFilterExpr") + public ExprNodeGenericFuncDesc getFileFilterExpr() { + return fileFilterExpr; + } + + public void setFileFilterExpr(ExprNodeGenericFuncDesc fileFilterExpr) { + this.fileFilterExpr = fileFilterExpr; + } + public void setAlias(String alias) { this.alias = alias; } diff --git ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java index 40298e1..8d692a9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java @@ -43,6 +43,7 @@ import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.FilePrunningPredicateHandler; import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler; import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler; import org.apache.hadoop.hive.ql.metadata.Table; @@ -773,7 +774,7 @@ protected static Object createFilter(Operator op, * by Hive as a post-filter, or null if it was possible * to push down the entire predicate */ - private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( + private static ExprNodeDesc pushFilterToStorageHandler( TableScanOperator tableScanOp, ExprNodeGenericFuncDesc originalPredicate, OpWalkerInfo owi, @@ -787,11 +788,19 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( // optimizations are applied tableScanDesc.setFilterExpr(originalPredicate); } - if (!tbl.isNonNative()) { + + HiveStoragePredicateHandler predicateHandler = null; + if (tbl.isNonNative()) { + HiveStorageHandler storageHandler = tbl.getStorageHandler(); + if (storageHandler instanceof HiveStoragePredicateHandler) { + predicateHandler = (HiveStoragePredicateHandler) storageHandler; + } + } else if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEPPDFILES)) { + predicateHandler = new FilePrunningPredicateHandler(); + } else { return originalPredicate; } - HiveStorageHandler storageHandler = tbl.getStorageHandler(); - if (!(storageHandler instanceof HiveStoragePredicateHandler)) { + if (predicateHandler == null) { // The storage handler does not provide predicate decomposition // support, so we'll implement the entire filter in Hive. 
However, // we still provide the full predicate to the storage handler in @@ -799,8 +808,7 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( tableScanDesc.setFilterExpr(originalPredicate); return originalPredicate; } - HiveStoragePredicateHandler predicateHandler = - (HiveStoragePredicateHandler) storageHandler; + JobConf jobConf = new JobConf(owi.getParseContext().getConf()); Utilities.setColumnNameList(jobConf, tableScanOp); Utilities.setColumnTypeList(jobConf, tableScanOp); @@ -835,8 +843,17 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( + decomposed.residualPredicate.getExprString()); } } - tableScanDesc.setFilterExpr(decomposed.pushedPredicate); - return (ExprNodeGenericFuncDesc)decomposed.residualPredicate; + + if (decomposed.pushedPredicate != null) { + ExprNodeGenericFuncDesc predicate = + ExprNodeDescUtils.toPredicate(decomposed.pushedPredicate); + if (predicateHandler instanceof FilePrunningPredicateHandler) { + tableScanDesc.setFileFilterExpr(predicate); + } else { + tableScanDesc.setFilterExpr(predicate); + } + } + return decomposed.residualPredicate; } public static NodeProcessor getFilterProc() { diff --git ql/src/test/queries/clientpositive/file_pruning.q ql/src/test/queries/clientpositive/file_pruning.q new file mode 100644 index 0000000..91e50de --- /dev/null +++ ql/src/test/queries/clientpositive/file_pruning.q @@ -0,0 +1,18 @@ +set hive.optimize.ppd.vc.filename=true; +set hive.auto.convert.join=false; + +-- HIVE-1662 File Prunning by filter on INPUT__FILE__NAME(VC) +-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23) +-- and selects srcbucket20 and srcbucket23 as input + +explain extended +select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100 order by filename; +select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100 order by filename; + +explain extended +select reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1); +select reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1); diff --git ql/src/test/results/clientpositive/file_pruning.q.out ql/src/test/results/clientpositive/file_pruning.q.out new file mode 100644 index 0000000..77b8025 --- /dev/null +++ ql/src/test/results/clientpositive/file_pruning.q.out @@ -0,0 +1,467 @@ +PREHOOK: query: -- HIVE-1662 File Prunning by filter on INPUT__FILE__NAME(VC) +-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23) +-- and selects srcbucket20 and srcbucket23 as input + +explain extended +select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME rlike 
'.*/srcbucket2[03].txt' AND key < 100 order by filename +PREHOOK: type: QUERY +POSTHOOK: query: -- HIVE-1662 File Prunning by filter on INPUT__FILE__NAME(VC) +-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23) +-- and selects srcbucket20 and srcbucket23 as input + +explain extended +select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100 order by filename +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME srcbucket2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value)) (TOK_SELEXPR (TOK_FUNCTION reflect2 (TOK_TABLE_OR_COL INPUT__FILE__NAME) "replaceAll" ".*/" "") filename)) (TOK_WHERE (AND (rlike (TOK_TABLE_OR_COL INPUT__FILE__NAME) '.*/srcbucket2[03].txt') (< (TOK_TABLE_OR_COL key) 100))) (TOK_ORDERBY (TOK_TABSORTCOLNAMEASC (TOK_TABLE_OR_COL filename))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + srcbucket2 + TableScan + alias: srcbucket2 + fileFilterExpr: + expr: (INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt') + type: boolean + Statistics: + numRows: 55 dataSize: 5812 basicStatsState: COMPLETE colStatsState: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: ((INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt') and (key < 100)) + type: boolean + Statistics: + numRows: 9 dataSize: 951 basicStatsState: COMPLETE colStatsState: NONE + Select Operator + expressions: + expr: key + type: int + expr: value + type: string + expr: reflect2(INPUT__FILE__NAME,'replaceAll','.*/','') + type: string + outputColumnNames: _col0, _col1, _col2 + Statistics: + numRows: 9 dataSize: 951 basicStatsState: COMPLETE colStatsState: NONE + Reduce Output Operator + key expressions: + expr: _col2 + type: string + sort order: + + Statistics: + numRows: 9 dataSize: 951 basicStatsState: COMPLETE colStatsState: NONE + tag: -1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: string + expr: _col2 + type: string + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: srcbucket2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.srcbucket2 + name: default.srcbucket2 + Truncated Path -> Alias: + /srcbucket2 [srcbucket2] + Needs Tagging: false + Reduce Operator Tree: + Extract + Statistics: + numRows: 9 dataSize: 951 basicStatsState: COMPLETE colStatsState: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: + numRows: 9 dataSize: 951 basicStatsState: COMPLETE colStatsState: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + +PREHOOK: query: select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100 order by filename +PREHOOK: type: QUERY +PREHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +POSTHOOK: query: select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100 order by filename +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +37 val_37 srcbucket20.txt +66 val_66 srcbucket20.txt +37 val_37 srcbucket20.txt +15 val_15 srcbucket20.txt +0 val_0 srcbucket20.txt +4 val_4 srcbucket20.txt +51 val_51 srcbucket20.txt +84 val_84 srcbucket20.txt +8 val_8 srcbucket20.txt +0 val_0 srcbucket20.txt +26 val_26 srcbucket20.txt +51 val_51 srcbucket20.txt +95 val_95 srcbucket20.txt +77 val_77 srcbucket20.txt +0 val_0 srcbucket20.txt +15 val_15 srcbucket20.txt +19 val_19 srcbucket20.txt +95 val_95 srcbucket20.txt +11 val_11 srcbucket20.txt +33 val_33 srcbucket20.txt +80 val_80 srcbucket20.txt +44 val_44 srcbucket20.txt +26 val_26 srcbucket20.txt +84 val_84 srcbucket20.txt +72 val_72 srcbucket23.txt +54 val_54 srcbucket23.txt +65 val_65 srcbucket23.txt +83 val_83 srcbucket23.txt +58 val_58 srcbucket23.txt +43 val_43 srcbucket23.txt +98 val_98 srcbucket23.txt +87 val_87 srcbucket23.txt +72 val_72 srcbucket23.txt +90 val_90 srcbucket23.txt +10 val_10 srcbucket23.txt +58 val_58 srcbucket23.txt +76 val_76 srcbucket23.txt +76 val_76 srcbucket23.txt +69 val_69 srcbucket23.txt +90 val_90 srcbucket23.txt +83 val_83 srcbucket23.txt +18 val_18 srcbucket23.txt +18 val_18 srcbucket23.txt +90 val_90 srcbucket23.txt +98 val_98 srcbucket23.txt +47 val_47 srcbucket23.txt +PREHOOK: query: explain extended +select reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1) +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select 
reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME srcbucket2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL INPUT__FILE__NAME))) (TOK_WHERE (AND (rlike (TOK_TABLE_OR_COL INPUT__FILE__NAME) '.*/srcbucket2[03].txt') (< (TOK_TABLE_OR_COL key) 100))))) a) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME srcbucket2))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL INPUT__FILE__NAME))) (TOK_WHERE (AND (rlike (TOK_TABLE_OR_COL INPUT__FILE__NAME) '.*/srcbucket2[02].txt') (< (TOK_TABLE_OR_COL key) 100))))) b) (= (. (TOK_TABLE_OR_COL a) key) (+ (. (TOK_TABLE_OR_COL b) key) 1)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION reflect2 (. (TOK_TABLE_OR_COL a) INPUT__FILE__NAME) "replaceAll" ".*/" "")) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION reflect2 (. (TOK_TABLE_OR_COL b) INPUT__FILE__NAME) "replaceAll" ".*/" "")) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a:srcbucket2 + TableScan + alias: srcbucket2 + fileFilterExpr: + expr: (INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt') + type: boolean + Statistics: + numRows: 1453 dataSize: 5812 basicStatsState: COMPLETE colStatsState: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: ((INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt') and (key < 100)) + type: boolean + Statistics: + numRows: 242 dataSize: 968 basicStatsState: COMPLETE colStatsState: NONE + Select Operator + expressions: + expr: key + type: int + expr: INPUT__FILE__NAME + type: string + outputColumnNames: _col0, _col1 + Statistics: + numRows: 242 dataSize: 968 basicStatsState: COMPLETE colStatsState: NONE + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + Statistics: + numRows: 242 dataSize: 968 basicStatsState: COMPLETE colStatsState: NONE + tag: 0 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: string + b:srcbucket2 + TableScan + alias: srcbucket2 + fileFilterExpr: + expr: (INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt') + type: boolean + Statistics: + numRows: 1453 dataSize: 5812 basicStatsState: COMPLETE colStatsState: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: + expr: ((INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt') and (key < 100)) + type: boolean + Statistics: + numRows: 242 dataSize: 968 basicStatsState: COMPLETE colStatsState: NONE + Select Operator + expressions: + expr: key + type: int + expr: INPUT__FILE__NAME + type: string + outputColumnNames: _col0, _col1 + Statistics: + numRows: 242 dataSize: 968 basicStatsState: COMPLETE colStatsState: NONE + Reduce Output Operator + key expressions: + 
expr: (_col0 + 1) + type: int + sort order: + + Map-reduce partition columns: + expr: (_col0 + 1) + type: int + Statistics: + numRows: 242 dataSize: 968 basicStatsState: COMPLETE colStatsState: NONE + tag: 1 + value expressions: + expr: _col0 + type: int + expr: _col1 + type: string + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: srcbucket2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.srcbucket2 + name: default.srcbucket2 + Truncated Path -> Alias: + /srcbucket2 [b:srcbucket2, a:srcbucket2] + Needs Tagging: true + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: + numRows: 266 dataSize: 1064 basicStatsState: COMPLETE colStatsState: NONE + Select Operator + expressions: + expr: reflect2(_col1,'replaceAll','.*/','') + type: string + expr: _col0 + type: int + expr: reflect2(_col3,'replaceAll','.*/','') + type: string + expr: _col2 + type: int + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: + numRows: 266 dataSize: 1064 basicStatsState: COMPLETE colStatsState: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: + numRows: 266 dataSize: 1064 basicStatsState: COMPLETE colStatsState: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:int + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + +PREHOOK: query: select reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 
where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +POSTHOOK: query: select reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +srcbucket20.txt 0 NULL NULL +srcbucket20.txt 0 NULL NULL +srcbucket20.txt 0 NULL NULL +srcbucket20.txt 4 NULL NULL +srcbucket20.txt 8 NULL NULL +srcbucket23.txt 10 NULL NULL +srcbucket20.txt 11 NULL NULL +srcbucket20.txt 15 NULL NULL +srcbucket20.txt 15 NULL NULL +srcbucket23.txt 18 srcbucket22.txt 17 +srcbucket23.txt 18 srcbucket22.txt 17 +srcbucket20.txt 19 NULL NULL +srcbucket20.txt 26 NULL NULL +srcbucket20.txt 26 NULL NULL +srcbucket20.txt 33 NULL NULL +srcbucket20.txt 37 NULL NULL +srcbucket20.txt 37 NULL NULL +srcbucket23.txt 43 srcbucket22.txt 42 +srcbucket23.txt 43 srcbucket22.txt 42 +srcbucket20.txt 44 NULL NULL +srcbucket23.txt 47 NULL NULL +srcbucket20.txt 51 NULL NULL +srcbucket20.txt 51 NULL NULL +srcbucket23.txt 54 srcbucket22.txt 53 +srcbucket23.txt 58 srcbucket22.txt 57 +srcbucket23.txt 58 srcbucket22.txt 57 +srcbucket23.txt 65 srcbucket22.txt 64 +srcbucket20.txt 66 NULL NULL +srcbucket23.txt 69 NULL NULL +srcbucket23.txt 72 NULL NULL +srcbucket23.txt 72 NULL NULL +srcbucket23.txt 76 NULL NULL +srcbucket23.txt 76 NULL NULL +srcbucket20.txt 77 NULL NULL +srcbucket20.txt 80 NULL NULL +srcbucket23.txt 83 srcbucket22.txt 82 +srcbucket23.txt 83 srcbucket22.txt 82 +srcbucket20.txt 84 NULL NULL +srcbucket20.txt 84 NULL NULL +srcbucket23.txt 87 srcbucket22.txt 86 +srcbucket23.txt 90 NULL NULL +srcbucket23.txt 90 NULL NULL +srcbucket23.txt 90 NULL NULL +srcbucket20.txt 95 NULL NULL +srcbucket20.txt 95 NULL NULL +srcbucket23.txt 98 srcbucket22.txt 97 +srcbucket23.txt 98 srcbucket22.txt 97 +srcbucket23.txt 98 srcbucket22.txt 97 +srcbucket23.txt 98 srcbucket22.txt 97
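
Note on usage (editor's addition, not part of the patch): file pruning is off by default and is enabled per session with "set hive.optimize.ppd.vc.filename=true;", as at the top of file_pruning.q. Because the decomposed predicate keeps the full filter as the residual, pruning is a pure optimization: rows from files that are not pruned (for example, when a path is shared by several aliases and the per-alias filters are ORed together) are still filtered row by row. The sketch below is a minimal, illustrative example of how the pushed file-filter expression stored in TableScanDesc.getFileFilterExpr() can be evaluated against a candidate input file, mirroring the ExprNodeEvaluator wiring that MapWork.summarize() uses internally (VC_FILE_OI / EVAL_OI). The class name FilenameFilterSketch and the method accepts(...) are hypothetical and exist only for illustration; only the Hive APIs it calls appear in the patch.

import java.util.Arrays;

import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

// Hypothetical helper, for illustration only; mirrors the evaluator setup in MapWork.summarize().
public class FilenameFilterSketch {

  // Returns true when the given input file path satisfies the pushed file-filter expression.
  public static boolean accepts(ExprNodeDesc fileFilterExpr, String inputFile) throws HiveException {
    // The filter is evaluated against a one-column "row" whose single field is the
    // INPUT__FILE__NAME virtual column, exactly as MapWork's VC_FILE_OI is built.
    ObjectInspector rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
        Arrays.asList(VirtualColumn.FILENAME.getName()),
        Arrays.asList((ObjectInspector) PrimitiveObjectInspectorFactory.javaStringObjectInspector));

    ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(fileFilterExpr);
    evaluator.initialize(rowOI);

    // Bind the candidate file path to INPUT__FILE__NAME and evaluate the boolean expression.
    Object result = evaluator.evaluate(new String[] {inputFile});
    return PrimitiveObjectInspectorFactory.writableBooleanObjectInspector.get(result);
  }
}

With this wiring, a pushed filter such as INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' admits only srcbucket20.txt and srcbucket23.txt out of the four srcbucket2 files, which is what the file_pruning.q plans and results above show.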