diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 388a604..56a0cf8 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -597,6 +597,7 @@ HIVEOPTPPD("hive.optimize.ppd", true), // predicate pushdown HIVEPPDRECOGNIZETRANSITIVITY("hive.ppd.recognizetransivity", true), // predicate pushdown HIVEPPDREMOVEDUPLICATEFILTERS("hive.ppd.remove.duplicatefilters", true), + HIVEPPDFILES("hive.optimize.ppd.vc.filename", false), HIVEMETADATAONLYQUERIES("hive.optimize.metadataonly", true), // push predicates down to storage handlers HIVEOPTPPD_STORAGE("hive.optimize.ppd.storage", true), diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index b9a5d38..8ad8874 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -1453,14 +1453,25 @@ public static Method getMethodInternal(Class udfClass, List mlist, bo * out of array and getting values out of map. */ public static GenericUDF getGenericUDFForIndex() { - return FunctionRegistry.getFunctionInfo("index").getGenericUDF(); + return getGenericUDF("index"); } /** * A shortcut to get the "and" GenericUDF. */ public static GenericUDF getGenericUDFForAnd() { - return FunctionRegistry.getFunctionInfo("and").getGenericUDF(); + return getGenericUDF("and"); + } + + /** + * A shortcut to get the "or" GenericUDF. + */ + public static GenericUDF getGenericUDFForOr() { + return getGenericUDF("or"); + } + + public static GenericUDF getGenericUDF(String name) { + return FunctionRegistry.getFunctionInfo(name).getGenericUDF(); } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 5995c14..52a3d39 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -2960,13 +2960,18 @@ public static double getHighestSamplePercentage (MapWork work) { Set pathsProcessed = new HashSet(); List pathsToAdd = new LinkedList(); // AliasToWork contains all the aliases - for (String alias : work.getAliasToWork().keySet()) { + Map> aliasToWork = work.getAliasToWork(); + Map> pathToAliases = + new LinkedHashMap>(work.getPathToAliases()); + + for (String alias : aliasToWork.keySet()) { LOG.info("Processing alias " + alias); // The alias may not have any path Path path = null; - for (String file : new LinkedList(work.getPathToAliases().keySet())) { - List aliases = work.getPathToAliases().get(file); + for (Map.Entry> entry : pathToAliases.entrySet()) { + String file = entry.getKey(); + List aliases = entry.getValue(); if (aliases.contains(alias)) { path = new Path(file); @@ -2979,13 +2984,16 @@ public static double getHighestSamplePercentage (MapWork work) { pathsProcessed.add(path); LOG.info("Adding input file " + path); + + List adding; if (!HiveConf.getVar(job, ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") && isEmptyPath(job, path, ctx)) { - path = createDummyFileForEmptyPartition(path, job, work, - hiveScratchDir, alias, sequenceNumber++); - + adding = Arrays.asList(createDummyFileForEmptyPartition(path, job, work, + hiveScratchDir, alias, sequenceNumber++)); + } else { + adding = work.getPathsFor(path, ctx.getConf()); } - pathsToAdd.add(path); + pathsToAdd.addAll(adding); } } diff --git 
ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java index 288da8e..ff181a9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ExecDriver.java @@ -36,6 +36,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.filecache.DistributedCache; +import org.apache.hadoop.fs.ContentSummary; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -103,11 +104,13 @@ private static final long serialVersionUID = 1L; private static final String JOBCONF_FILENAME = "jobconf.xml"; - protected transient JobConf job; - public static MemoryMXBean memoryMXBean; + protected static transient final Log LOG = LogFactory.getLog(ExecDriver.class); + protected static MemoryMXBean memoryMXBean; + protected HadoopJobExecHelper jobExecHelper; - protected static transient final Log LOG = LogFactory.getLog(ExecDriver.class); + protected transient JobConf job; + protected transient ContentSummary inputSummary; private RunningJob rj; @@ -120,6 +123,13 @@ public ExecDriver() { this.jobExecHelper = new HadoopJobExecHelper(job, console, this, this); } + public ContentSummary getInputSummary(Context ctx) throws IOException, HiveException { + if (inputSummary == null) { + inputSummary = getWork().getTotalSummary(ctx, job); + } + return inputSummary; + } + @Override public boolean requireLock() { return true; diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java index 326654f..aa31adb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/mr/MapRedTask.java @@ -39,6 +39,7 @@ import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.Utilities.StreamPrinter; +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.MapredWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.ReduceWork; @@ -63,7 +64,6 @@ static final String HIVE_CHILD_CLIENT_DEBUG_OPTS = "HIVE_CHILD_CLIENT_DEBUG_OPTS"; static final String[] HIVE_SYS_PROP = {"build.dir", "build.dir.hive", "hive.query.id"}; - private transient ContentSummary inputSummary = null; private transient boolean runningViaChild = false; private transient long totalInputFileSize; @@ -88,15 +88,13 @@ public int execute(DriverContext driverContext) { } // estimate number of reducers - setNumberOfReducers(); + setNumberOfReducers(ctx); // auto-determine local mode if allowed if (!ctx.isLocalOnlyExecutionMode() && conf.getBoolVar(HiveConf.ConfVars.LOCALMODEAUTO)) { - if (inputSummary == null) { - inputSummary = Utilities.getInputSummary(driverContext.getCtx(), work.getMapWork(), null); - } + ContentSummary inputSummary = getInputSummary(ctx); // set the values of totalInputFileSize and totalInputNumFiles, estimating them // if percentage block sampling is being used @@ -373,7 +371,7 @@ public boolean reduceDone() { /** * Set the number of reducers for the mapred work. 
*/ - private void setNumberOfReducers() throws IOException { + private void setNumberOfReducers(Context ctx) throws IOException, HiveException { ReduceWork rWork = work.getReduceWork(); // this is a temporary hack to fix things that are not fixed in the compiler Integer numReducersFromWork = rWork == null ? 0 : rWork.getNumReduceTasks(); @@ -392,9 +390,7 @@ private void setNumberOfReducers() throws IOException { .printInfo("Number of reduce tasks not specified. Defaulting to jobconf value of: " + reducers); } else { - if (inputSummary == null) { - inputSummary = Utilities.getInputSummary(driverContext.getCtx(), work.getMapWork(), null); - } + ContentSummary inputSummary = getInputSummary(ctx); int reducers = Utilities.estimateNumberOfReducers(conf, inputSummary, work.getMapWork(), work.isFinalMapRed()); rWork.setNumReduceTasks(reducers); diff --git ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java index d39ee2e..3997db7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java @@ -58,11 +58,22 @@ private final Set udfNames; private final Set allowedColumnNames; + private boolean allowAllColumns; + private boolean allowAllFunctions; + public IndexPredicateAnalyzer() { udfNames = new HashSet(); allowedColumnNames = new HashSet(); } + public void setAllowAllColumns(boolean allowAllColumns) { + this.allowAllColumns = allowAllColumns; + } + + public void setAllowAllFunctions(boolean allowAllFunctions) { + this.allowAllFunctions = allowAllFunctions; + } + /** * Registers a comparison operator as one which can be satisfied * by an index search. Unless this is called, analyzePredicate @@ -183,13 +194,13 @@ private ExprNodeDesc analyzeExpr( } String udfName = genericUDF.getUdfName(); - if (!udfNames.contains(genericUDF.getUdfName())) { + if (!allowAllFunctions && !udfNames.contains(genericUDF.getUdfName())) { return expr; } ExprNodeColumnDesc columnDesc = (ExprNodeColumnDesc) extracted[0]; ExprNodeConstantDesc constantDesc = (ExprNodeConstantDesc) extracted[1]; - if (!allowedColumnNames.contains(columnDesc.getColumn())) { + if (!allowAllColumns && !allowedColumnNames.contains(columnDesc.getColumn())) { return expr; } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java index 95db96b..8f8153f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveFileFormatUtils.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.ql.io; import java.io.IOException; -import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -34,7 +33,6 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.Utilities; -import org.apache.hadoop.hive.ql.io.HivePassThroughOutputFormat; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; @@ -189,7 +187,7 @@ public static synchronized void registerInputFormatChecker( * get an InputFormatChecker for a file format. 
*/ public static synchronized Class getInputFormatChecker( - Class inputFormat) { + Class inputFormat) { Class result = inputFormatCheckerMap .get(inputFormat); return result; @@ -427,7 +425,6 @@ private static String getMatchingPath(Map> pathToAlias if (foundAlias(pathToAliases, dirPath)) { return dirPath; } - path = dirPath; String dirStr = dir.toString(); int dirPathIndex = dirPath.lastIndexOf(Path.SEPARATOR); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java index 647a9a6..c572dd6 100755 --- ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java @@ -327,7 +327,8 @@ private void addSplitsForGroup(List dirs, TableScanOperator tableScan, Job // for each dir, get the InputFormat, and do getSplits. for (Path dir : dirs) { - PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir); + PartitionDesc part = HiveFileFormatUtils. + getPartitionDescFromPathRecursively(pathToPartitionInfo, dir, null); Class inputFormatClass = part.getInputFileFormatClass(); TableDesc table = part.getTableDesc(); TableScanOperator tableScan = null; diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePrunningPredicateHandler.java ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePrunningPredicateHandler.java new file mode 100644 index 0000000..4420856 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/FilePrunningPredicateHandler.java @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.metadata; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer; +import org.apache.hadoop.hive.ql.index.IndexSearchCondition; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.mapred.JobConf; + +/** + * Extracts file pruning expression (used by MapredWork) + */ +public class FilePrunningPredicateHandler implements HiveStoragePredicateHandler { + + @Override + public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer deserializer, + ExprNodeDesc predicate) { + IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer(); + analyzer.allowColumnName(VirtualColumn.FILENAME.getName()); + analyzer.setAllowAllFunctions(true); + + List searchConditions = new ArrayList(); + ExprNodeDesc residual = analyzer.analyzePredicate(predicate, searchConditions); + + // there is no assertion that pushedPredicate to be applied always, so residue all of them. 
+ DecomposedPredicate decomposedPredicate = new DecomposedPredicate(); + decomposedPredicate.pushedPredicate = analyzer.translateSearchConditions(searchConditions); + + // keep all (todo: can be removed when it's referenced by one alias) + decomposedPredicate.residualPredicate = predicate; + return decomposedPredicate; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java index 9f35575..df56aa9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStoragePredicateHandler.java @@ -18,10 +18,9 @@ package org.apache.hadoop.hive.ql.metadata; +import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.serde2.Deserializer; -import org.apache.hadoop.mapred.JobConf; /** * HiveStoragePredicateHandler is an optional companion to {@link @@ -66,12 +65,12 @@ public DecomposedPredicate decomposePredicate( * Portion of predicate to be evaluated by storage handler. Hive * will pass this into the storage handler's input format. */ - public ExprNodeGenericFuncDesc pushedPredicate; + public ExprNodeDesc pushedPredicate; /** * Portion of predicate to be post-evaluated by Hive for any rows * which are returned by storage handler. */ - public ExprNodeGenericFuncDesc residualPredicate; + public ExprNodeDesc residualPredicate; } } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java index 33ef581..177ff2d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/AbstractJoinTaskDispatcher.java @@ -124,10 +124,6 @@ public long getTotalKnownInputSize(Context context, MapWork currWork, Map> pathToAliases, HashMap aliasToSize) throws SemanticException { try { - // go over all the input paths, and calculate a known total size, known - // size for each input alias. - Utilities.getInputSummary(context, currWork, null).getLength(); - // set alias to size mapping, this can be used to determine if one table // is chosen as big table, what's the total size of left tables, which // are going to be small tables. 
@@ -135,7 +131,7 @@ public long getTotalKnownInputSize(Context context, MapWork currWork, for (Map.Entry> entry : pathToAliases.entrySet()) { String path = entry.getKey(); List aliasList = entry.getValue(); - ContentSummary cs = context.getCS(path); + ContentSummary cs = currWork.getSummaryFor(path, context.getConf()); if (cs != null) { long size = cs.getLength(); for (String alias : aliasList) { diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java index 5c6751c..a665345 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/index/IndexWhereProcessor.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.optimizer.physical.index; -import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashSet; @@ -33,7 +32,6 @@ import org.apache.hadoop.hive.metastore.api.Index; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Task; -import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.mr.MapRedTask; import org.apache.hadoop.hive.ql.hooks.ReadEntity; import org.apache.hadoop.hive.ql.index.HiveIndexHandler; @@ -204,14 +202,15 @@ private void rewriteForIndexes(ExprNodeDesc predicate, List indexes, } // check the size + MapredWork work = task.getWork(); try { - ContentSummary inputSummary = Utilities.getInputSummary(pctx.getContext(), task.getWork().getMapWork(), null); + ContentSummary inputSummary = work.getTotalSummary(pctx.getContext(), pctx.getConf()); long inputSize = inputSummary.getLength(); if (!indexHandler.checkQuerySize(inputSize, pctx.getConf())) { queryContext.setQueryTasks(null); return; } - } catch (IOException e) { + } catch (Exception e) { throw new SemanticException("Failed to get task size", e); } diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java index 76f5a31..bd59711 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.parse; -import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; @@ -222,9 +221,9 @@ public boolean accept(Path file) { boolean hasNonLocalJob = false; for (ExecDriver mrtask : mrtasks) { try { - ContentSummary inputSummary = Utilities.getInputSummary - (ctx, ((MapredWork) mrtask.getWork()).getMapWork(), p); - int numReducers = getNumberOfReducers(mrtask.getWork(), conf); + MapredWork work = mrtask.getWork(); + ContentSummary inputSummary = work.getTotalSummary(lCtx, ctx.getConf()); + int numReducers = getNumberOfReducers(work, conf); long estimatedInput; @@ -257,7 +256,7 @@ public boolean accept(Path file) { } else { mrtask.setLocalMode(true); } - } catch (IOException e) { + } catch (Exception e) { throw new SemanticException(e); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java index e50026b..70ee0a3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.plan; import 
java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; @@ -32,6 +33,7 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.util.ReflectionUtils; @@ -99,10 +101,30 @@ public static boolean containsPredicate(ExprNodeDesc source, ExprNodeDesc predic return false; } + public static ExprNodeGenericFuncDesc toPredicate(ExprNodeDesc predicate) { + if (predicate instanceof ExprNodeGenericFuncDesc) { + return (ExprNodeGenericFuncDesc) predicate; + } + return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + FunctionRegistry.getGenericUDF(serdeConstants.BOOLEAN_TYPE_NAME), + new ArrayList(Arrays.asList(predicate))); + } + + /** + * bind two predicates by OR op + */ + public static ExprNodeDesc orPredicates(ExprNodeDesc prev, ExprNodeDesc next) { + List children = new ArrayList(2); + children.add(prev); + children.add(next); + return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, + FunctionRegistry.getGenericUDFForOr(), children); + } + /** * bind two predicates by AND op */ - public static ExprNodeGenericFuncDesc mergePredicates(ExprNodeDesc prev, ExprNodeDesc next) { + public static ExprNodeDesc andPredicates(ExprNodeDesc prev, ExprNodeDesc next) { List children = new ArrayList(2); children.add(prev); children.add(next); @@ -120,7 +142,7 @@ public static ExprNodeDesc mergePredicates(List exprs) { prev = expr; continue; } - prev = mergePredicates(prev, expr); + prev = andPredicates(prev, expr); } return prev; } @@ -133,7 +155,7 @@ public static ExprNodeDesc mergePredicates(List exprs) { } /** - * split predicates by AND op + * split 'current' by AND op and append to 'splitted' */ public static List split(ExprNodeDesc current, List splitted) { if (FunctionRegistry.isOpAnd(current)) { diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java index e1cc3f4..6473a13 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java @@ -18,9 +18,12 @@ package org.apache.hadoop.hive.ql.plan; +import java.io.FileNotFoundException; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; @@ -32,16 +35,33 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory; import org.apache.hadoop.hive.ql.exec.FileSinkOperator; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.OperatorUtils; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat; +import org.apache.hadoop.hive.ql.io.HiveInputFormat; +import 
org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.BucketCol; import org.apache.hadoop.hive.ql.optimizer.physical.BucketingSortingCtx.SortCol; import org.apache.hadoop.hive.ql.parse.OpParseContext; import org.apache.hadoop.hive.ql.parse.QBJoinTree; import org.apache.hadoop.hive.ql.parse.SplitSample; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.JobConf; /** @@ -531,4 +551,220 @@ public void setVectorMode(boolean vectorMode) { this.vectorMode = vectorMode; } + private static final ObjectInspector VC_FILE_OI = + ObjectInspectorFactory.getStandardStructObjectInspector( + Arrays.asList(VirtualColumn.FILENAME.getName()), + Arrays.asList((ObjectInspector) PrimitiveObjectInspectorFactory.javaStringObjectInspector)); + + private static final BooleanObjectInspector EVAL_OI = + PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; + + private transient Map pathToEvals; + private transient Map summaries; + + public ContentSummary getSummaryFor(String path, Configuration conf) + throws IOException, HiveException { + return getInputSummaryFor(path, conf).toContentSummary(); + } + + // if file pruning is applied, return file paths passed the filter. if not, return input path + public List getPathsFor(String path, Configuration conf) + throws IOException, HiveException { + return getInputSummaryFor(path, conf).paths; + } + + public List getPathsFor(Path path, Configuration conf) + throws IOException, HiveException { + List paths = new ArrayList(); + for (String apath : getPathsFor(path.toString(), conf)) { + paths.add(new Path(apath)); + } + return paths; + } + + private InputSummary getInputSummaryFor(String path, Configuration conf) + throws IOException, HiveException { + InputSummary summary = summaries == null ? 
null : summaries.get(path); + if (summary == null) { + ExprNodeEvaluator evaluator = getPathToEvals().get(path); + if (summaries == null) { + summaries = new HashMap(); + } + summaries.put(path, summary = summarize(path, conf, evaluator)); + } + return summary; + } + + // get summaries for all input paths and return total of them + public ContentSummary getTotalSummary(Context ctx, Configuration conf) + throws IOException, HiveException { + long length = 0; + long fileCount = 0; + long directoryCount = 0; + for (String path : pathToAliases.keySet()) { + ContentSummary pathSummary = getSummaryFor(path, conf); + if (pathSummary != null) { + length += pathSummary.getLength(); + fileCount += pathSummary.getFileCount(); + directoryCount += pathSummary.getDirectoryCount(); + ctx.addCS(path, pathSummary); + } + } + return new ContentSummary(length, fileCount, directoryCount); + } + + // return or-conjuncted file pruning filter + private ExprNodeEvaluator toFilter(List> operators) throws HiveException { + ExprNodeDesc prev = null; + for (Operator operator : operators) { + if (operator instanceof TableScanOperator && operator.getConf() != null) { + ExprNodeDesc filterExpr = ((TableScanOperator) operator).getConf().getFileFilterExpr(); + if (filterExpr == null) { + continue; + } + if (prev == null) { + prev = filterExpr; + } else { + prev = ExprNodeDescUtils.orPredicates(prev, filterExpr); + } + } + } + if (prev == null) { + return null; + } + ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(prev); + evaluator.initialize(VC_FILE_OI); + return evaluator; + } + + // evaluate input path with file pruning filter + private InputSummary summarize(String path, Configuration conf, ExprNodeEvaluator evaluator) + throws IOException, HiveException { + Path apath = new Path(path); + Class format = pathToPartitionInfo.get(path).getInputFileFormatClass(); + if (ContentSummaryInputFormat.class.isAssignableFrom(format)) { + JobConf jobConf = new JobConf(conf); + ContentSummaryInputFormat summaryInput = (ContentSummaryInputFormat) + HiveInputFormat.getInputFormatFromCache(format, jobConf); + ContentSummary content = summaryInput.getContentSummary(apath, jobConf); + return new InputSummary(path, content); + } + if (evaluator == null) { + FileSystem fs = apath.getFileSystem(conf); + if (fs.exists(apath)) { + return new InputSummary(path, fs.getContentSummary(apath)); + } + return new InputSummary(path); + } + InputSummary summary = new InputSummary(); + for (FileStatus inputStatus : getInputStatus(apath, conf)) { + String input = normalize(path, inputStatus.getPath().toString()); + Object evaluated = evaluator.evaluate(new String[]{input}); + if (EVAL_OI.get(evaluated)) { + summary.add(input, inputStatus); + } + } + if (summary.paths.isEmpty()) { + // not existing directory + summary.paths.add(path); + } + return summary; + } + + private String normalize(String path, String inputPath) { + int index = inputPath.lastIndexOf(path); + return path + inputPath.substring(index + path.length()); + } + + private List getInputStatus(Path path, Configuration conf) throws IOException { + FileSystem fs = path.getFileSystem(conf); + if (!fs.exists(path)) { + return Collections.emptyList(); + } + FileStatus status; + try { + status = fs.getFileStatus(path); + } catch (FileNotFoundException e) { + return Collections.emptyList(); + } + return iterate(fs, status, new ArrayList()); + } + + private List iterate(FileSystem fs, FileStatus status, List passed) + throws IOException { + if (status.isDir()) { + FileStatus[] children 
= fs.listStatus(status.getPath()); + if (children != null) { + for (FileStatus child : children) { + iterate(fs, child, passed); + } + } + } else { + passed.add(status); + } + return passed; + } + + private Map getPathToEvals() throws HiveException { + if (pathToEvals == null) { + pathToEvals = new HashMap(); + } + for (Map.Entry> entry : pathToAliases.entrySet()) { + pathToEvals.put(entry.getKey(), toFilter(getWorkForAliases(entry.getValue()))); + } + return pathToEvals; + } + + private List> getWorkForAliases(List aliases) { + List> operators = new ArrayList>(); + for (String alias : aliases) { + Operator work = aliasToWork.get(alias); + if (work == null) { + throw new IllegalStateException("Invalid alias " + alias); + } + operators.add(work); + } + return operators; + } + + private static class InputSummary { + + private boolean exists = false; + + private long length; + private long fileCount; + private long directoryCount; + private List paths; + + public InputSummary() { + paths = new ArrayList(); + } + + public InputSummary(String path) { + paths = Arrays.asList(path); + } + + public InputSummary(String path, ContentSummary content) { + this.paths = Arrays.asList(path); + this.length = content.getLength(); + this.fileCount = content.getFileCount(); + this.directoryCount = content.getDirectoryCount(); + this.exists = true; + } + + public void add(String path, FileStatus status) { + paths.add(path); + length += status.getLen(); + if (!status.isDir()) { + fileCount++; + } else { + directoryCount++; + } + exists = true; + } + + public ContentSummary toContentSummary() { + return !exists ? null : new ContentSummary(length, fileCount, directoryCount); + } + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java index f3203bf..800ddca 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapredWork.java @@ -18,10 +18,15 @@ package org.apache.hadoop.hive.ql.plan; +import java.io.IOException; import java.util.ArrayList; import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.hive.ql.Context; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.mapred.JobConf; @@ -80,4 +85,9 @@ public void configureJobConf(JobConf job) { return ops; } + + public ContentSummary getTotalSummary(Context ctx, Configuration conf) + throws IOException, HiveException { + return mapWork.getTotalSummary(ctx, conf); + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java index 10bae4d..3461efa 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java @@ -61,6 +61,7 @@ private int maxStatsKeyPrefixLength = -1; private ExprNodeGenericFuncDesc filterExpr; + private ExprNodeGenericFuncDesc fileFilterExpr; public static final String FILTER_EXPR_CONF_STR = "hive.io.filter.expr.serialized"; @@ -110,6 +111,15 @@ public void setFilterExpr(ExprNodeGenericFuncDesc filterExpr) { this.filterExpr = filterExpr; } + @Explain(displayName = "fileFilterExpr") + public ExprNodeGenericFuncDesc getFileFilterExpr() { + return fileFilterExpr; + } + + public void setFileFilterExpr(ExprNodeGenericFuncDesc fileFilterExpr) { + this.fileFilterExpr = fileFilterExpr; + } + public void 
setAlias(String alias) { this.alias = alias; } diff --git ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java index 40298e1..8d692a9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/ppd/OpProcFactory.java @@ -43,6 +43,7 @@ import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.FilePrunningPredicateHandler; import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler; import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler; import org.apache.hadoop.hive.ql.metadata.Table; @@ -773,7 +774,7 @@ protected static Object createFilter(Operator op, * by Hive as a post-filter, or null if it was possible * to push down the entire predicate */ - private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( + private static ExprNodeDesc pushFilterToStorageHandler( TableScanOperator tableScanOp, ExprNodeGenericFuncDesc originalPredicate, OpWalkerInfo owi, @@ -787,11 +788,19 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( // optimizations are applied tableScanDesc.setFilterExpr(originalPredicate); } - if (!tbl.isNonNative()) { + + HiveStoragePredicateHandler predicateHandler = null; + if (tbl.isNonNative()) { + HiveStorageHandler storageHandler = tbl.getStorageHandler(); + if (storageHandler instanceof HiveStoragePredicateHandler) { + predicateHandler = (HiveStoragePredicateHandler) storageHandler; + } + } else if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEPPDFILES)) { + predicateHandler = new FilePrunningPredicateHandler(); + } else { return originalPredicate; } - HiveStorageHandler storageHandler = tbl.getStorageHandler(); - if (!(storageHandler instanceof HiveStoragePredicateHandler)) { + if (predicateHandler == null) { // The storage handler does not provide predicate decomposition // support, so we'll implement the entire filter in Hive. 
However, // we still provide the full predicate to the storage handler in @@ -799,8 +808,7 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( tableScanDesc.setFilterExpr(originalPredicate); return originalPredicate; } - HiveStoragePredicateHandler predicateHandler = - (HiveStoragePredicateHandler) storageHandler; + JobConf jobConf = new JobConf(owi.getParseContext().getConf()); Utilities.setColumnNameList(jobConf, tableScanOp); Utilities.setColumnTypeList(jobConf, tableScanOp); @@ -835,8 +843,17 @@ private static ExprNodeGenericFuncDesc pushFilterToStorageHandler( + decomposed.residualPredicate.getExprString()); } } - tableScanDesc.setFilterExpr(decomposed.pushedPredicate); - return (ExprNodeGenericFuncDesc)decomposed.residualPredicate; + + if (decomposed.pushedPredicate != null) { + ExprNodeGenericFuncDesc predicate = + ExprNodeDescUtils.toPredicate(decomposed.pushedPredicate); + if (predicateHandler instanceof FilePrunningPredicateHandler) { + tableScanDesc.setFileFilterExpr(predicate); + } else { + tableScanDesc.setFilterExpr(predicate); + } + } + return decomposed.residualPredicate; } public static NodeProcessor getFilterProc() { diff --git ql/src/test/queries/clientpositive/file_pruning.q ql/src/test/queries/clientpositive/file_pruning.q new file mode 100644 index 0000000..f6bed1b --- /dev/null +++ ql/src/test/queries/clientpositive/file_pruning.q @@ -0,0 +1,18 @@ +set hive.optimize.ppd.vc.filename=true; +set hive.auto.convert.join=false; + +-- HIVE-1662 File Prunning by filter on INPUT__FILE__NAME(VC) +-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23) +-- and selects srcbucket20 and srcbucket23 as input + +explain extended +select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100 order by filename, key, value; +select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100 order by filename, key, value; + +explain extended +select reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1); +select reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1); diff --git ql/src/test/results/clientpositive/file_pruning.q.out ql/src/test/results/clientpositive/file_pruning.q.out new file mode 100644 index 0000000..a6db2da --- /dev/null +++ ql/src/test/results/clientpositive/file_pruning.q.out @@ -0,0 +1,537 @@ +PREHOOK: query: -- HIVE-1662 File Prunning by filter on INPUT__FILE__NAME(VC) +-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23) +-- and selects srcbucket20 and srcbucket23 as input + +explain extended +select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME 
rlike '.*/srcbucket2[03].txt' AND key < 100 order by filename, key, value +PREHOOK: type: QUERY +POSTHOOK: query: -- HIVE-1662 File Prunning by filter on INPUT__FILE__NAME(VC) +-- srcbucket2 has 4 files (srcbucket20, srcbucket21, srcbucket22, srcbucket23) +-- and selects srcbucket20 and srcbucket23 as input + +explain extended +select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100 order by filename, key, value +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + srcbucket2 + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_TABLE_OR_COL + key + TOK_SELEXPR + TOK_TABLE_OR_COL + value + TOK_SELEXPR + TOK_FUNCTION + reflect2 + TOK_TABLE_OR_COL + INPUT__FILE__NAME + "replaceAll" + ".*/" + "" + filename + TOK_WHERE + AND + rlike + TOK_TABLE_OR_COL + INPUT__FILE__NAME + '.*/srcbucket2[03].txt' + < + TOK_TABLE_OR_COL + key + 100 + TOK_ORDERBY + TOK_TABSORTCOLNAMEASC + TOK_TABLE_OR_COL + filename + TOK_TABSORTCOLNAMEASC + TOK_TABLE_OR_COL + key + TOK_TABSORTCOLNAMEASC + TOK_TABLE_OR_COL + value + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: srcbucket2 + Statistics: Num rows: 55 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: ((INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt') and (key < 100)) (type: boolean) + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string), reflect2(INPUT__FILE__NAME,'replaceAll','.*/','') (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col2 (type: string), _col0 (type: int), _col1 (type: string) + sort order: +++ + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: int), _col1 (type: string), _col2 (type: string) + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: srcbucket2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.srcbucket2 + name: default.srcbucket2 + Truncated Path -> Alias: + /srcbucket2 [srcbucket2] + Needs Tagging: false + Reduce Operator Tree: + Extract + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 9 Data size: 951 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2 + columns.types int:string:string + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + +PREHOOK: query: select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100 order by filename, key, value +PREHOOK: type: QUERY +PREHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +POSTHOOK: query: select key, value, reflect2(INPUT__FILE__NAME, "replaceAll", ".*/", "") as filename from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100 order by filename, key, value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +0 val_0 srcbucket20.txt +0 val_0 srcbucket20.txt +0 val_0 srcbucket20.txt +4 val_4 srcbucket20.txt +8 val_8 srcbucket20.txt +11 val_11 srcbucket20.txt +15 val_15 srcbucket20.txt +15 val_15 srcbucket20.txt +19 val_19 srcbucket20.txt +26 val_26 srcbucket20.txt +26 val_26 srcbucket20.txt +33 val_33 srcbucket20.txt +37 val_37 srcbucket20.txt +37 val_37 srcbucket20.txt +44 val_44 srcbucket20.txt +51 val_51 srcbucket20.txt +51 val_51 srcbucket20.txt +66 val_66 srcbucket20.txt +77 val_77 srcbucket20.txt +80 val_80 srcbucket20.txt +84 val_84 srcbucket20.txt +84 val_84 srcbucket20.txt +95 val_95 srcbucket20.txt +95 val_95 srcbucket20.txt +10 val_10 srcbucket23.txt +18 val_18 srcbucket23.txt +18 val_18 srcbucket23.txt +43 val_43 srcbucket23.txt +47 val_47 srcbucket23.txt +54 val_54 srcbucket23.txt +58 val_58 srcbucket23.txt +58 val_58 srcbucket23.txt +65 val_65 srcbucket23.txt +69 val_69 srcbucket23.txt +72 val_72 srcbucket23.txt +72 val_72 srcbucket23.txt +76 val_76 srcbucket23.txt +76 val_76 srcbucket23.txt +83 val_83 srcbucket23.txt +83 val_83 srcbucket23.txt +87 val_87 srcbucket23.txt +90 val_90 srcbucket23.txt +90 val_90 srcbucket23.txt +90 val_90 srcbucket23.txt +98 val_98 srcbucket23.txt +98 val_98 srcbucket23.txt +PREHOOK: query: explain extended +select reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1) +PREHOOK: type: QUERY +POSTHOOK: query: explain 
extended +select reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1) +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + +TOK_QUERY + TOK_FROM + TOK_LEFTOUTERJOIN + TOK_SUBQUERY + TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + srcbucket2 + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_TABLE_OR_COL + key + TOK_SELEXPR + TOK_TABLE_OR_COL + INPUT__FILE__NAME + TOK_WHERE + AND + rlike + TOK_TABLE_OR_COL + INPUT__FILE__NAME + '.*/srcbucket2[03].txt' + < + TOK_TABLE_OR_COL + key + 100 + a + TOK_SUBQUERY + TOK_QUERY + TOK_FROM + TOK_TABREF + TOK_TABNAME + srcbucket2 + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_TABLE_OR_COL + key + TOK_SELEXPR + TOK_TABLE_OR_COL + INPUT__FILE__NAME + TOK_WHERE + AND + rlike + TOK_TABLE_OR_COL + INPUT__FILE__NAME + '.*/srcbucket2[02].txt' + < + TOK_TABLE_OR_COL + key + 100 + b + = + . + TOK_TABLE_OR_COL + a + key + + + . + TOK_TABLE_OR_COL + b + key + 1 + TOK_INSERT + TOK_DESTINATION + TOK_DIR + TOK_TMP_FILE + TOK_SELECT + TOK_SELEXPR + TOK_FUNCTION + reflect2 + . + TOK_TABLE_OR_COL + a + INPUT__FILE__NAME + "replaceAll" + ".*/" + "" + TOK_SELEXPR + . + TOK_TABLE_OR_COL + a + key + TOK_SELEXPR + TOK_FUNCTION + reflect2 + . + TOK_TABLE_OR_COL + b + INPUT__FILE__NAME + "replaceAll" + ".*/" + "" + TOK_SELEXPR + . + TOK_TABLE_OR_COL + b + key + + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: srcbucket2 + Statistics: Num rows: 1453 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: ((INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt') and (key < 100)) (type: boolean) + Statistics: Num rows: 242 Data size: 968 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), INPUT__FILE__NAME (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 242 Data size: 968 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: (_col0 + 1) (type: int) + sort order: + + Map-reduce partition columns: (_col0 + 1) (type: int) + Statistics: Num rows: 242 Data size: 968 Basic stats: COMPLETE Column stats: NONE + tag: 1 + value expressions: _col0 (type: int), _col1 (type: string) + TableScan + alias: srcbucket2 + Statistics: Num rows: 1453 Data size: 5812 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: ((INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt') and (key < 100)) (type: boolean) + Statistics: Num rows: 242 Data size: 968 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), INPUT__FILE__NAME (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 242 Data size: 968 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 242 Data size: 968 Basic stats: COMPLETE Column stats: NONE + tag: 0 + value expressions: _col0 (type: int), _col1 (type: 
string) + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: srcbucket2 + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE true + bucket_count 4 + bucket_field_name key + columns key,value + columns.types int:string +#### A masked pattern was here #### + name default.srcbucket2 + numFiles 4 + numRows 0 + rawDataSize 0 + serialization.ddl struct srcbucket2 { i32 key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.srcbucket2 + name: default.srcbucket2 + Truncated Path -> Alias: + /srcbucket2 [b:srcbucket2, a:srcbucket2] + Needs Tagging: true + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 266 Data size: 1064 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: reflect2(_col1,'replaceAll','.*/','') (type: string), _col0 (type: int), reflect2(_col3,'replaceAll','.*/','') (type: string), _col2 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 266 Data size: 1064 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 266 Data size: 1064 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0,_col1,_col2,_col3 + columns.types string:int:string:int + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + +PREHOOK: query: select reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +POSTHOOK: query: select 
reflect2(a.INPUT__FILE__NAME, "replaceAll", ".*/", ""),a.key,reflect2(b.INPUT__FILE__NAME, "replaceAll", ".*/", ""),b.key from + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' AND key < 100) a left outer join + (select key,INPUT__FILE__NAME from srcbucket2 where INPUT__FILE__NAME rlike '.*/srcbucket2[02].txt' AND key < 100) b on (a.key=b.key+1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcbucket2 +#### A masked pattern was here #### +srcbucket20.txt 0 NULL NULL +srcbucket20.txt 0 NULL NULL +srcbucket20.txt 0 NULL NULL +srcbucket20.txt 4 NULL NULL +srcbucket20.txt 8 NULL NULL +srcbucket23.txt 10 NULL NULL +srcbucket20.txt 11 NULL NULL +srcbucket20.txt 15 NULL NULL +srcbucket20.txt 15 NULL NULL +srcbucket23.txt 18 srcbucket22.txt 17 +srcbucket23.txt 18 srcbucket22.txt 17 +srcbucket20.txt 19 NULL NULL +srcbucket20.txt 26 NULL NULL +srcbucket20.txt 26 NULL NULL +srcbucket20.txt 33 NULL NULL +srcbucket20.txt 37 NULL NULL +srcbucket20.txt 37 NULL NULL +srcbucket23.txt 43 srcbucket22.txt 42 +srcbucket23.txt 43 srcbucket22.txt 42 +srcbucket20.txt 44 NULL NULL +srcbucket23.txt 47 NULL NULL +srcbucket20.txt 51 NULL NULL +srcbucket20.txt 51 NULL NULL +srcbucket23.txt 54 srcbucket22.txt 53 +srcbucket23.txt 58 srcbucket22.txt 57 +srcbucket23.txt 58 srcbucket22.txt 57 +srcbucket23.txt 65 srcbucket22.txt 64 +srcbucket20.txt 66 NULL NULL +srcbucket23.txt 69 NULL NULL +srcbucket23.txt 72 NULL NULL +srcbucket23.txt 72 NULL NULL +srcbucket23.txt 76 NULL NULL +srcbucket23.txt 76 NULL NULL +srcbucket20.txt 77 NULL NULL +srcbucket20.txt 80 NULL NULL +srcbucket23.txt 83 srcbucket22.txt 82 +srcbucket23.txt 83 srcbucket22.txt 82 +srcbucket20.txt 84 NULL NULL +srcbucket20.txt 84 NULL NULL +srcbucket23.txt 87 srcbucket22.txt 86 +srcbucket23.txt 90 NULL NULL +srcbucket23.txt 90 NULL NULL +srcbucket23.txt 90 NULL NULL +srcbucket20.txt 95 NULL NULL +srcbucket20.txt 95 NULL NULL +srcbucket23.txt 98 srcbucket22.txt 97 +srcbucket23.txt 98 srcbucket22.txt 97 +srcbucket23.txt 98 srcbucket22.txt 97 +srcbucket23.txt 98 srcbucket22.txt 97
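
Illustrative note (not part of the patch): the core mechanism added in MapWork.summarize() is evaluating the pushed predicate against a one-column row that holds only the INPUT__FILE__NAME virtual column. The sketch below rebuilds the rlike predicate from file_pruning.q by hand purely for illustration (in the real flow the planner builds it and FilePrunningPredicateHandler extracts it); the class name, main method, and sample paths are invented, while the evaluator/ObjectInspector usage mirrors the patch.

import java.util.Arrays;

import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class FilePruningFilterSketch {

  // Row schema seen by the file filter: a single struct field named INPUT__FILE__NAME,
  // the same shape as MapWork.VC_FILE_OI in the patch.
  private static final ObjectInspector FILE_OI =
      ObjectInspectorFactory.getStandardStructObjectInspector(
          Arrays.asList(VirtualColumn.FILENAME.getName()),
          Arrays.asList((ObjectInspector)
              PrimitiveObjectInspectorFactory.javaStringObjectInspector));

  public static void main(String[] args) throws HiveException {
    // INPUT__FILE__NAME rlike '.*/srcbucket2[03].txt' -- the predicate used in file_pruning.q
    ExprNodeDesc fileName = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo,
        VirtualColumn.FILENAME.getName(), null, true);
    ExprNodeDesc pattern = new ExprNodeConstantDesc(".*/srcbucket2[03].txt");
    ExprNodeDesc filter = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
        FunctionRegistry.getGenericUDF("rlike"), Arrays.asList(fileName, pattern));

    // Same evaluation pattern as MapWork.summarize(): one evaluator per path's filter,
    // one evaluate() call per candidate file, boolean result read via the writable boolean OI.
    ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(filter);
    evaluator.initialize(FILE_OI);

    for (String path : new String[] {
        "hdfs:///warehouse/srcbucket2/srcbucket20.txt",     // sample paths, invented
        "hdfs:///warehouse/srcbucket2/srcbucket21.txt"}) {
      Object result = evaluator.evaluate(new String[] {path});
      boolean keep = PrimitiveObjectInspectorFactory.writableBooleanObjectInspector.get(result);
      System.out.println(path + " -> " + (keep ? "kept" : "pruned"));
    }
  }
}

This is also why the handler leaves the whole predicate as residualPredicate: file-level pruning is a best-effort reduction of the input, and the row-level filter still runs on whatever files survive.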
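
Illustrative note (not part of the patch): getGenericUDFForOr() and ExprNodeDescUtils.orPredicates() exist because one input path can be read by several table scans (the self-join in file_pruning.q is the test case), so the per-scan file filters have to be OR-ed; a file may only be pruned if no scan can need it. The helper below is a hypothetical, slightly conservative restatement of MapWork.toFilter(): it disables pruning for the path as soon as one scan has no file filter, whereas toFilter() above simply skips such scans.

import java.util.List;

import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;

public class OrCombineSketch {

  /** OR together the file filters of all scans reading one path; null means "keep every file". */
  public static ExprNodeDesc combinedFileFilter(List<TableScanOperator> scans) {
    ExprNodeDesc combined = null;
    for (TableScanOperator scan : scans) {
      ExprNodeDesc filter = scan.getConf() == null ? null : scan.getConf().getFileFilterExpr();
      if (filter == null) {
        // one scan reads the path unconditionally, so no file may be pruned
        return null;
      }
      combined = combined == null ? filter
          : ExprNodeDescUtils.orPredicates(combined, filter);
    }
    return combined;
  }
}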
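
Illustrative note (not part of the patch): the contract of the new MapWork.getPathsFor(), as stated by its comment, is "the surviving files if a file filter applies, otherwise the input path itself" (no filter, a ContentSummaryInputFormat path, or a missing directory all fall back to the original path). A minimal caller-side sketch under those assumptions; the class and method names are invented.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapWork;

public class PrunedPathsSketch {

  /** Lists what the job will actually read under one configured input path. */
  public static void dump(MapWork mapWork, Path inputDir, Configuration conf)
      throws IOException, HiveException {
    // Either the files that passed the INPUT__FILE__NAME filter, or inputDir unchanged.
    List<Path> paths = mapWork.getPathsFor(inputDir, conf);
    for (Path p : paths) {
      System.out.println(p);
    }
  }
}

This is the same call Utilities.getInputPaths() now makes instead of adding the alias's directory directly, so the job only registers files that can contribute rows.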
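
Illustrative note (not part of the patch): MapRedTask, IndexWhereProcessor, MapReduceCompiler and AbstractJoinTaskDispatcher all switch from Utilities.getInputSummary() to the new MapredWork/MapWork methods, so size estimates (reducer count, local-mode auto, index and map-join decisions) are based on the pruned file set and the per-path summaries are computed once and cached. A sketch of the caller-side pattern, assuming a MapredWork and a Context as in the patch; the wrapper class is invented.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapredWork;

public class InputSummarySketch {

  /** Total bytes the job will actually read after file pruning. */
  public static long prunedInputSize(MapredWork work, Context ctx, Configuration conf)
      throws IOException, HiveException {
    // Also registers each path's ContentSummary in the Context via ctx.addCS(),
    // matching what MapWork.getTotalSummary() does in the patch.
    ContentSummary summary = work.getTotalSummary(ctx, conf);
    return summary.getLength();
  }
}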
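
Illustrative note (not part of the patch): widening DecomposedPredicate's fields from ExprNodeGenericFuncDesc to ExprNodeDesc stays source-compatible for handlers that assign into the fields, since the old type is a subtype of the new one; only code that read the fields back as ExprNodeGenericFuncDesc needs a cast or ExprNodeDescUtils.toPredicate(), which is what OpProcFactory now does. A hedged sketch of a third-party handler after the change; the handler name, the 'key' column, and the choice of '=' as the only pushable operator are invented for illustration.

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.mapred.JobConf;

/** Hypothetical storage-handler predicate handler compiled against the widened interface. */
public class KeyEqualsPredicateHandler implements HiveStoragePredicateHandler {

  @Override
  public DecomposedPredicate decomposePredicate(JobConf jobConf, Deserializer deserializer,
      ExprNodeDesc predicate) {
    IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();
    // only '=' on column 'key' is assumed to be served by the storage layer
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual");
    analyzer.allowColumnName("key");

    List<IndexSearchCondition> conditions = new ArrayList<IndexSearchCondition>();
    DecomposedPredicate result = new DecomposedPredicate();
    // both fields are plain ExprNodeDesc now; assigning the analyzer's output still compiles
    result.residualPredicate = analyzer.analyzePredicate(predicate, conditions);
    result.pushedPredicate = analyzer.translateSearchConditions(conditions);
    return result;
  }
}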
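
Illustrative note (not part of the patch): ExprNodeDescUtils.toPredicate() exists because TableScanDesc.setFilterExpr()/setFileFilterExpr() still take ExprNodeGenericFuncDesc, while a decomposed pushed predicate may now be a bare expression such as a lone boolean column; toPredicate() wraps such expressions in the registered "boolean" conversion UDF. A minimal sketch, assuming a hypothetical boolean column 'flag' on table alias 't'.

import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class ToPredicateSketch {

  public static void main(String[] args) {
    // a bare boolean column is a legal predicate but not an ExprNodeGenericFuncDesc
    ExprNodeDesc flag = new ExprNodeColumnDesc(TypeInfoFactory.booleanTypeInfo, "flag", "t", false);
    // already a function call? returned as-is; otherwise wrapped in the boolean conversion UDF
    ExprNodeGenericFuncDesc wrapped = ExprNodeDescUtils.toPredicate(flag);
    System.out.println(wrapped.getExprString());
  }
}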