diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 4ba53106e2..90357c7902 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2457,6 +2457,12 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal "The comma-separated list of SerDe classes that are considered when enhancing table-properties \n" + "during logical optimization."), + HIVE_OPTIMIZE_SCAN_PROBEDECODE("hive.optimize.scan.probedecode", false, + "Whether to find suitable table scan operators that could reduce the number of decoded rows at runtime by probing extra available information. \n" + + "The probe side for the row-level filtering is generated either statically in the case of expressions or dynamically for joins, \n" + + "e.g., use the cached MapJoin hashtable created on the small table side to filter out rows whose columns are not going " + + "to be used when reading the large table data. This will result in fewer CPU cycles spent on decoding unused data."), + // CTE HIVE_CTE_MATERIALIZE_THRESHOLD("hive.optimize.cte.materialize.threshold", -1, "If the number of references to a CTE clause exceeds this threshold, Hive will materialize it\n" + diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index e99ce7babb..35a709be22 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -540,6 +540,8 @@ minillaplocal.query.files=\ cross_prod_3.q,\ cross_prod_4.q,\ dpp.q,\ + probedecode_mapjoin_stats.q,\ + probedecode_mapjoin_simple.q,\ dynamic_partition_pruning.q,\ dynamic_partition_join_noncbo.q,\ dynamic_semijoin_reduction.q,\ diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java index cd0637a971..ea66fc7dc9 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java @@ -21,8 +21,10 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -84,6 +86,50 @@ private String schemaEvolutionColumns; private String schemaEvolutionColumnsTypes; + private ProbeDecodeContext probeDecodeContextSet; + + /** + * Inner wrapper class for TS ProbeDecode optimization + */ + public static class ProbeDecodeContext { + + private final String mjSmallTableCacheKey; + private final String mjBigTableKeyColName; + private final byte mjSmallTablePos; + private final double keyRatio; + + public ProbeDecodeContext(String mjSmallTableCacheKey, byte mjSmallTablePos, String mjBigTableKeyColName, + double keyRatio) { + this.mjSmallTableCacheKey = mjSmallTableCacheKey; + this.mjSmallTablePos = mjSmallTablePos; + this.mjBigTableKeyColName = mjBigTableKeyColName; + this.keyRatio = keyRatio; + } + + public String getMjSmallTableCacheKey() { + return mjSmallTableCacheKey; + } + + public byte getMjSmallTablePos() { + return mjSmallTablePos; + } + + public String getMjBigTableKeyColName() { + return mjBigTableKeyColName; + } + + public double getKeyRatio() { + return keyRatio; + } + + @Override + public String toString() { + return "cacheKey:" + mjSmallTableCacheKey + ", bigKeyColName:"
+ mjBigTableKeyColName + + ", smallTablePos:" + mjSmallTablePos + ", keyRatio:" + keyRatio; + } + } + + public TableDesc getTableDescSkewJoin() { return tableDesc; } @@ -435,4 +481,12 @@ public VectorizationContext getOutputVectorizationContext() { return taskVectorizationContext; } + public ProbeDecodeContext getProbeDecodeContext() { + return probeDecodeContextSet; + } + + public void setProbeDecodeContext(ProbeDecodeContext probeDecodeContext) { + this.probeDecodeContextSet = probeDecodeContext; + } + } diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java index 2f49985800..caed527d71 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SharedWorkOptimizer.java @@ -487,6 +487,15 @@ private static boolean sharedWorkOptimization(ParseContext pctx, SharedWorkOptim } LOG.debug("Input operator removed: {}", op); } + + // A shared TSop across branches can not have probeContext that utilizes single branch info + // Filtered-out rows from one branch might be needed by another branch sharing a TSop + if (retainableTsOp.getProbeDecodeContext() != null) { + LOG.debug("Removing probeDecodeCntx for merged TS op {}", retainableTsOp); + retainableTsOp.setProbeDecodeContext(null); + retainableTsOp.getConf().setProbeDecodeContext(null); + } + // Then we merge the operators of the works we are going to merge optimizerCache.removeOpAndCombineWork(discardableTsOp, retainableTsOp); removedOps.add(discardableTsOp); diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java index 4441ea3215..027f73f9d2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java @@ -196,6 +196,11 @@ public MapWork createMapWork(GenTezProcContext context, Operator root, mapWork.setIncludedBuckets(ts.getConf().getIncludedBuckets()); } + if (ts.getProbeDecodeContext() != null) { + // TODO: some operators like VectorPTFEvaluator do not allow the use of Selected take this into account here? 
+ mapWork.setProbeDecodeContext(ts.getProbeDecodeContext()); + } + // add new item to the tez work tezWork.add(mapWork); diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java index caab0564f2..d07ff21b7f 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java @@ -125,6 +125,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.plan.MapJoinDesc; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.MoveWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; @@ -138,6 +139,8 @@ import org.apache.hadoop.hive.ql.stats.OperatorStats; import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBloomFilter.GenericUDAFBloomFilterEvaluator; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -1310,7 +1313,8 @@ private static void runTopNKeyOptimization(OptimizeTezProcContext procCtx) private boolean findParallelSemiJoinBranch(Operator mapjoin, TableScanOperator bigTableTS, ParseContext parseContext, - Map semijoins) { + Map semijoins, + Map> probeDecodeMJoins) { boolean parallelEdges = false; for (Operator op : mapjoin.getParentOperators()) { @@ -1377,12 +1381,13 @@ private boolean findParallelSemiJoinBranch(Operator mapjoin, TableScanOperato parallelEdges = true; - if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) { - // Created by hint, skip it - continue; - } // Add the semijoin branch to the map semijoins.put(rs, ts); + // Add mapJoin branch to probeDecode table + if (!probeDecodeMJoins.containsKey(ts)){ + probeDecodeMJoins.put(ts, new ArrayList<>()); + } + probeDecodeMJoins.get(ts).add((MapJoinOperator) mapjoin); } } } @@ -1443,13 +1448,15 @@ private void removeSemiJoinEdgesForUnion(OptimizeTezProcContext procCtx) throws * The algorithm looks at all the mapjoins in the operator pipeline until * it hits RS Op and for each mapjoin examines if it has paralllel semijoin * edge or dynamic partition pruning. + * + * As an extension, the algorithm also looks for suitable table scan operators that + * could reduce the number of rows decoded at runtime using the information provided by + * the MapJoin operators of the branch when ProbeDecode feature is enabled. */ private void removeSemijoinsParallelToMapJoin(OptimizeTezProcContext procCtx) throws SemanticException { - if(!procCtx.conf.getBoolVar(ConfVars.HIVECONVERTJOIN) || - procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_MAPJOIN)) { - // Not needed without semi-join reduction or mapjoins or when semijoins - // are enabled for parallel mapjoins. + if (!procCtx.conf.getBoolVar(ConfVars.HIVECONVERTJOIN)) { + // Not needed without mapjoin conversion return; } @@ -1458,6 +1465,7 @@ private void removeSemijoinsParallelToMapJoin(OptimizeTezProcContext procCtx) topOps.addAll(procCtx.parseContext.getTopOps().values()); Map semijoins = new HashMap<>(); + Map> probeDecodeMJoins = new HashMap<>(); for (Operator parent : topOps) { // A TS can have multiple branches due to DPP Or Semijoin Opt. // USe DFS to traverse all the branches until RS is hit. 
@@ -1473,7 +1481,7 @@ private void removeSemijoinsParallelToMapJoin(OptimizeTezProcContext procCtx) if (op instanceof MapJoinOperator) { // A candidate. if (!findParallelSemiJoinBranch(op, (TableScanOperator) parent, - procCtx.parseContext, semijoins)) { + procCtx.parseContext, semijoins, probeDecodeMJoins)) { // No parallel edge was found for the given mapjoin op, // no need to go down further, skip this TS operator pipeline. break; @@ -1482,18 +1490,131 @@ private void removeSemijoinsParallelToMapJoin(OptimizeTezProcContext procCtx) deque.addAll(op.getChildOperators()); } } + // No need to remove SJ branches when we have semi-join reduction or when semijoins are enabled for parallel mapjoins. + if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_FOR_MAPJOIN)) { + if (semijoins.size() > 0) { + for (Entry semiEntry : semijoins.entrySet()) { + SemiJoinBranchInfo sjInfo = procCtx.parseContext.getRsToSemiJoinBranchInfo().get(semiEntry.getKey()); + if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) { + // Created by hint, skip it + continue; + } + if (LOG.isDebugEnabled()) { + LOG.debug("Semijoin optimization with parallel edge to map join. Removing semijoin " + + OperatorUtils.getOpNamePretty(semiEntry.getKey()) + " - " + OperatorUtils.getOpNamePretty(semiEntry.getValue())); + } + GenTezUtils.removeBranch(semiEntry.getKey()); + GenTezUtils.removeSemiJoinOperator(procCtx.parseContext, semiEntry.getKey(), semiEntry.getValue()); + } + } + } + if (procCtx.conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_SCAN_PROBEDECODE)) { + if (probeDecodeMJoins.size() > 0) { + // When multiple MJ, select one based on a policy + for (Map.Entry> probeTsMap : probeDecodeMJoins.entrySet()){ + TableScanOperator.ProbeDecodeContext tsCntx = null; + // Currently supporting: LowestRatio policy + // TODO: Add more policies and make the selection a conf property + tsCntx = selectLowestRatioProbeDecodeMapJoin(probeTsMap.getKey(), probeTsMap.getValue()); + LOG.debug("ProbeDecode MJ for TS {} with CacheKey {} MJ Pos {} ColName {} with Ratio {}", + probeTsMap.getKey().getName(), tsCntx.getMjSmallTableCacheKey(), tsCntx.getMjSmallTablePos(), + tsCntx.getMjBigTableKeyColName(), tsCntx.getKeyRatio()); + probeTsMap.getKey().setProbeDecodeContext(tsCntx); + probeTsMap.getKey().getConf().setProbeDecodeContext(tsCntx); + } + } + } + } - if (semijoins.size() > 0) { - for (ReduceSinkOperator rs : semijoins.keySet()) { - if (LOG.isDebugEnabled()) { - LOG.debug("Semijoin optimization with parallel edge to map join. 
Removing semijoin " - + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(semijoins.get(rs))); + private static TableScanOperator.ProbeDecodeContext selectLowestRatioProbeDecodeMapJoin(TableScanOperator tsOp, + List mjOps){ + MapJoinOperator selectedMJOp = null; + double selectedMJOpRatio = 0; + for (MapJoinOperator currMJOp : mjOps) { + if (!isValidProbeDecodeMapJoin(currMJOp)) { + continue; + } + // At this point we know it is a single Key MapJoin + if (selectedMJOp == null) { + // Set the first valid MJ + selectedMJOp = currMJOp; + selectedMJOpRatio = getProbeDecodeNDVRatio(tsOp, currMJOp); + LOG.debug("ProbeDecode MJ {} with Ratio {}", selectedMJOp, selectedMJOpRatio); + } else { + double currMJRatio = getProbeDecodeNDVRatio(tsOp, currMJOp); + if (currMJRatio < selectedMJOpRatio){ + LOG.debug("ProbeDecode MJ {} Ratio {} is lower than existing MJ {} with Ratio {}", + currMJOp, currMJRatio, selectedMJOp, selectedMJOpRatio); + selectedMJOp = currMJOp; + selectedMJOpRatio = currMJRatio; } - GenTezUtils.removeBranch(rs); - GenTezUtils.removeSemiJoinOperator(procCtx.parseContext, rs, - semijoins.get(rs)); } } + + TableScanOperator.ProbeDecodeContext tsProbeDecodeCtx = null; + // If there a valid MJ to be used for TS probeDecode make sure the MJ cache key is generated and + // then propagate the new ProbeDecodeContext (to be used by LLap IO when executing the TSop) + if (selectedMJOp != null) { + String mjCacheKey = selectedMJOp.getConf().getCacheKey(); + if (mjCacheKey == null) { + // Generate cache key if it has not been yet generated + mjCacheKey = MapJoinDesc.generateCacheKey(selectedMJOp.getOperatorId()); + // Set in the conf of the map join operator + selectedMJOp.getConf().setCacheKey(mjCacheKey); + } + + byte posBigTable = (byte) selectedMJOp.getConf().getPosBigTable(); + Byte[] order = selectedMJOp.getConf().getTagOrder(); + Byte mjSmallTablePos = (order[0] == posBigTable ? order[1] : order[0]); + + List keyDesc = selectedMJOp.getConf().getKeys().get(posBigTable); + ExprNodeColumnDesc keyCol = (ExprNodeColumnDesc) keyDesc.get(0); + + tsProbeDecodeCtx = new TableScanOperator.ProbeDecodeContext(mjCacheKey, mjSmallTablePos, + keyCol.getColumn(), selectedMJOpRatio); + } + return tsProbeDecodeCtx; + } + + // Return the ratio of: (distinct) JOIN_probe_key_column_rows / (distinct) JOIN_TS_target_column_rows + private static double getProbeDecodeNDVRatio(TableScanOperator tsOp, MapJoinOperator mjOp) { + long mjKeyCardinality = mjOp.getStatistics().getNumRows(); + long tsKeyCardinality = tsOp.getStatistics().getNumRows(); + + byte posBigTable = (byte) mjOp.getConf().getPosBigTable(); + + Byte[] order = mjOp.getConf().getTagOrder(); + Byte mjSmallTablePos = (order[0] == posBigTable ? order[1] : order[0]); + Byte mjBigTablePos = (order[0] == posBigTable ? 
order[0] : order[1]); + + // Single Key MJ at this point + List tsKeyDesc = mjOp.getConf().getKeys().get(mjBigTablePos); + ExprNodeColumnDesc tsKeyCol = (ExprNodeColumnDesc) tsKeyDesc.get(0); + + List mjKeyDesc = mjOp.getConf().getKeys().get(mjSmallTablePos); + ExprNodeColumnDesc mjKeyCol = (ExprNodeColumnDesc) mjKeyDesc.get(0); + + ColStatistics mjStats = mjOp.getStatistics().getColumnStatisticsFromColName(mjKeyCol.getColumn()); + ColStatistics tsStats = tsOp.getStatistics().getColumnStatisticsFromColName(tsKeyCol.getColumn()); + + if (canUseNDV(mjStats)) { + mjKeyCardinality = mjStats.getCountDistint(); + } + if (canUseNDV(tsStats)) { + tsKeyCardinality = tsStats.getCountDistint(); + } + return mjKeyCardinality / (double) tsKeyCardinality; + } + + // Valid MapJoin with a single Key of Number type (Long/Int/Short) + private static boolean isValidProbeDecodeMapJoin(MapJoinOperator mapJoinOp) { + Map> keyExprs = mapJoinOp.getConf().getKeys(); + List bigTableKeyExprs = keyExprs.get( (byte) mapJoinOp.getConf().getPosBigTable()); + return (bigTableKeyExprs.size() == 1) + && !(((PrimitiveTypeInfo) bigTableKeyExprs.get(0).getTypeInfo()).getPrimitiveCategory(). + equals(PrimitiveObjectInspector.PrimitiveCategory.STRING) || + ((PrimitiveTypeInfo) bigTableKeyExprs.get(0).getTypeInfo()).getPrimitiveCategory(). + equals(PrimitiveObjectInspector.PrimitiveCategory.BYTE)); } private static boolean canUseNDV(ColStatistics colStats) { @@ -1717,6 +1838,7 @@ private static void sortSemijoinFilters(OptimizeTezProcContext procCtx, private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx) throws SemanticException { + Map map = procCtx.parseContext.getRsToSemiJoinBranchInfo(); if (map.isEmpty()) { // Nothing to do diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java index ef7e956fc7..045e06b421 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.plan; import org.apache.hadoop.hive.common.StringInternUtils; +import org.apache.hadoop.hive.ql.exec.TableScanOperator.ProbeDecodeContext; import org.apache.hadoop.hive.ql.exec.Utilities; import java.util.ArrayList; @@ -175,6 +176,8 @@ private boolean isMergeFromResolver; + private ProbeDecodeContext probeDecodeContext = null; + public MapWork() {} public MapWork(String name) { @@ -846,6 +849,14 @@ public void setVectorizationEnabledConditionsNotMet(Collection vectoriza return vectorizationEnabledConditionsNotMet; } + public ProbeDecodeContext getProbeDecodeContext() { + return probeDecodeContext; + } + + public void setProbeDecodeContext(ProbeDecodeContext probeDecodeContext) { + this.probeDecodeContext = probeDecodeContext; + } + public class MapExplainVectorization extends BaseExplainVectorization { private final MapWork mapWork; diff --git ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java index d2e22c8388..53a6036298 100644 --- ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java +++ ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.plan; import org.apache.hadoop.hive.common.type.DataTypePhysicalVariation; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.io.AcidUtils; import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler; import 
org.apache.hadoop.hive.ql.metadata.Table; @@ -115,6 +116,8 @@ private AcidUtils.AcidOperationalProperties acidOperationalProperties = null; + private TableScanOperator.ProbeDecodeContext probeDecodeContext = null; + private transient TableSample tableSample; private transient Table tableMetadata; @@ -133,10 +136,15 @@ public TableScanDesc(Table tblMetadata) { } public TableScanDesc(final String alias, Table tblMetadata) { - this(alias, null, tblMetadata); + this(alias, null, tblMetadata, null); } public TableScanDesc(final String alias, List vcs, Table tblMetadata) { + this(alias, vcs, tblMetadata, null); + } + + public TableScanDesc(final String alias, List vcs, Table tblMetadata, + TableScanOperator.ProbeDecodeContext probeDecodeContext) { this.alias = alias; this.virtualCols = vcs; this.tableMetadata = tblMetadata; @@ -149,12 +157,13 @@ public TableScanDesc(final String alias, List vcs, Table tblMetad if (isTranscationalTable) { acidOperationalProperties = AcidUtils.getAcidOperationalProperties(this.tableMetadata); } + this.probeDecodeContext = probeDecodeContext; } @Override public Object clone() { List vcs = new ArrayList(getVirtualCols()); - return new TableScanDesc(getAlias(), vcs, this.tableMetadata); + return new TableScanDesc(getAlias(), vcs, this.tableMetadata, this.probeDecodeContext); } @Explain(displayName = "alias") @@ -238,6 +247,18 @@ public void setFilterExpr(ExprNodeGenericFuncDesc filterExpr) { this.filterExpr = filterExpr; } + @Explain(displayName = "probeDecodeDetails", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String getProbeDecodeString() { + if (probeDecodeContext == null) { + return null; + } + return probeDecodeContext.toString(); + } + + public void setProbeDecodeContext(TableScanOperator.ProbeDecodeContext probeDecodeContext) { + this.probeDecodeContext = probeDecodeContext; + } + public Serializable getFilterObject() { return filterObject; } diff --git ql/src/test/queries/clientpositive/probedecode_mapjoin_simple.q ql/src/test/queries/clientpositive/probedecode_mapjoin_simple.q new file mode 100644 index 0000000000..90550e23e0 --- /dev/null +++ ql/src/test/queries/clientpositive/probedecode_mapjoin_simple.q @@ -0,0 +1,31 @@ +set hive.stats.column.autogather=false; +set hive.mapred.mode=nonstrict; +set hive.explain.user=false; +SET hive.auto.convert.join=true; +SET hive.auto.convert.join.noconditionaltask=true; +SET hive.auto.convert.join.noconditionaltask.size=1000000000; +SET hive.vectorized.execution.enabled=true; +set hive.vectorized.execution.mapjoin.native.fast.hashtable.enabled=true; +set hive.fetch.task.conversion=none; +SET mapred.min.split.size=1000; +SET mapred.max.split.size=5000; + +CREATE TABLE item_dim (key1 int, name string) stored as ORC; +CREATE TABLE orders_fact (nokey int, key2 int, dt timestamp) stored as ORC; + +INSERT INTO item_dim values(101, "Item 101"); +INSERT INTO item_dim values(102, "Item 102"); + +INSERT INTO orders_fact values(12345, 101, '2001-01-30 00:00:00'); +INSERT INTO orders_fact values(23456, 104, '2002-02-30 00:00:00'); +INSERT INTO orders_fact values(34567, 108, '2003-03-30 00:00:00'); +INSERT INTO orders_fact values(45678, 102, '2004-04-30 00:00:00'); +INSERT INTO orders_fact values(56789, 109, '2005-05-30 00:00:00'); +INSERT INTO orders_fact values(67891, 110, '2006-06-30 00:00:00'); + +SET hive.optimize.scan.probedecode=true; + +EXPLAIN VECTORIZATION DETAIL select key1, key2, name, dt from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1); + +-- two keys match, the remaining rows can 
be skipped +select key1, key2, name, dt from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1); diff --git ql/src/test/queries/clientpositive/probedecode_mapjoin_stats.q ql/src/test/queries/clientpositive/probedecode_mapjoin_stats.q new file mode 100644 index 0000000000..0dde302695 --- /dev/null +++ ql/src/test/queries/clientpositive/probedecode_mapjoin_stats.q @@ -0,0 +1,55 @@ +SET hive.auto.convert.join=true; +SET hive.auto.convert.join.noconditionaltask=true; +SET hive.auto.convert.join.noconditionaltask.size=1000000000; + +SET hive.vectorized.execution.enabled=true; +set hive.vectorized.execution.mapjoin.native.fast.hashtable.enabled=true; + +set hive.stats.dbclass=fs; +set hive.stats.fetch.column.stats=true; +set datanucleus.cache.collections=false; + +set hive.merge.mapfiles=false; +set hive.merge.mapredfiles=false; + +set hive.stats.autogather=true; +set hive.stats.column.autogather=true; +set hive.compute.query.using.stats=true; +set hive.mapred.mode=nonstrict; +set hive.explain.user=false; + +set hive.fetch.task.conversion=none; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.query.results.cache.enabled=false; + +SET mapred.min.split.size=1000; +SET mapred.max.split.size=5000; + +CREATE TABLE item_dim (key1 int, name string) stored as ORC; +CREATE TABLE orders_fact (key3 int, key2 int, dt timestamp) stored as ORC; +CREATE TABLE seller_dim (key4 int, sellername string) stored as ORC; + +INSERT INTO item_dim values(101, "Item 101"); +INSERT INTO item_dim values(102, "Item 102"); +INSERT INTO item_dim values(103, "Item 103"); +INSERT INTO item_dim values(104, "Item 104"); +INSERT INTO item_dim values(105, "Item 105"); + +INSERT INTO orders_fact values(12345, 101, '2001-01-30 00:00:00'); +INSERT INTO orders_fact values(23456, 104, '2002-02-30 00:00:00'); +INSERT INTO orders_fact values(34567, 108, '2003-03-30 00:00:00'); +INSERT INTO orders_fact values(45678, 102, '2004-04-30 00:00:00'); +INSERT INTO orders_fact values(56789, 109, '2005-05-30 00:00:00'); +INSERT INTO orders_fact values(67891, 110, '2006-06-30 00:00:00'); + +-- Less cardinality than item_dim +INSERT INTO seller_dim values(12345, "Seller 1"); +INSERT INTO seller_dim values(23456, "Seller 2"); + +SET hive.optimize.scan.probedecode=true; + +EXPLAIN VECTORIZATION DETAIL select key1, key2, key3, key4, sellername, name, dt from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1) join seller_dim on (orders_fact.key3 = seller_dim.key4); + +-- two keys match, the remaining rows can be skipped +select key1, key2, key3, key4, sellername, name, dt from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1) join seller_dim on (orders_fact.key3 = seller_dim.key4); diff --git ql/src/test/results/clientpositive/llap/probedecode_mapjoin_simple.q.out ql/src/test/results/clientpositive/llap/probedecode_mapjoin_simple.q.out new file mode 100644 index 0000000000..cd4dafb44e --- /dev/null +++ ql/src/test/results/clientpositive/llap/probedecode_mapjoin_simple.q.out @@ -0,0 +1,282 @@ +PREHOOK: query: CREATE TABLE item_dim (key1 int, name string) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@item_dim +POSTHOOK: query: CREATE TABLE item_dim (key1 int, name string) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@item_dim +PREHOOK: query: CREATE TABLE orders_fact (nokey int, key2 int, dt timestamp) stored as ORC +PREHOOK: type: 
CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orders_fact +POSTHOOK: query: CREATE TABLE orders_fact (nokey int, key2 int, dt timestamp) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orders_fact +PREHOOK: query: INSERT INTO item_dim values(101, "Item 101") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@item_dim +POSTHOOK: query: INSERT INTO item_dim values(101, "Item 101") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@item_dim +POSTHOOK: Lineage: item_dim.key1 SCRIPT [] +POSTHOOK: Lineage: item_dim.name SCRIPT [] +PREHOOK: query: INSERT INTO item_dim values(102, "Item 102") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@item_dim +POSTHOOK: query: INSERT INTO item_dim values(102, "Item 102") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@item_dim +POSTHOOK: Lineage: item_dim.key1 SCRIPT [] +POSTHOOK: Lineage: item_dim.name SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(12345, 101, '2001-01-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(12345, 101, '2001-01-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.nokey SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(23456, 104, '2002-02-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(23456, 104, '2002-02-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.nokey SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(34567, 108, '2003-03-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(34567, 108, '2003-03-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.nokey SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(45678, 102, '2004-04-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(45678, 102, '2004-04-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.nokey SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(56789, 109, '2005-05-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(56789, 109, '2005-05-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: 
_dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.nokey SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(67891, 110, '2006-06-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(67891, 110, '2006-06-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.nokey SCRIPT [] +PREHOOK: query: EXPLAIN VECTORIZATION DETAIL select key1, key2, name, dt from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1) +PREHOOK: type: QUERY +PREHOOK: Input: default@item_dim +PREHOOK: Input: default@orders_fact +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN VECTORIZATION DETAIL select key1, key2, name, dt from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@item_dim +POSTHOOK: Input: default@orders_fact +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Map 2 (BROADCAST_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: orders_fact + filterExpr: key2 is not null (type: boolean) + probeDecodeDetails: cacheKey:HASH_MAP_MAPJOIN_25_container, bigKeyColName:_col0, smallTablePos:1, keyRatio:1.0 + Statistics: Num rows: 6 Data size: 264 Basic stats: COMPLETE Column stats: NONE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:nokey:int, 1:key2:int, 2:dt:timestamp, 3:ROW__ID:struct] + Filter Operator + Filter Vectorization: + className: VectorFilterOperator + native: true + predicateExpression: SelectColumnIsNotNull(col 1:int) + predicate: key2 is not null (type: boolean) + Statistics: Num rows: 6 Data size: 264 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key2 (type: int), dt (type: timestamp) + outputColumnNames: _col0, _col1 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [1, 2] + Statistics: Num rows: 6 Data size: 264 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + Map Join Vectorization: + bigTableKeyColumns: 1:int + bigTableRetainColumnNums: [1, 2] + bigTableValueColumns: 1:int, 2:timestamp + className: VectorMapJoinInnerLongOperator + native: true + nativeConditionsMet: hive.mapjoin.optimized.hashtable IS true, hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small table vectorizes IS true, Fast Hash Table and No Hybrid Hash Join IS true + nonOuterSmallTableKeyMapping: [] + projectedOutput: 1:int, 2:timestamp, 1:int, 4:string + smallTableValueMapping: 4:string + hashTableImplementationType: FAST + outputColumnNames: _col0, _col1, _col2, _col3 + input vertices: + 1 Map 2 + Statistics: Num rows: 6 Data size: 290 Basic stats: COMPLETE 
Column stats: NONE + Select Operator + expressions: _col2 (type: int), _col0 (type: int), _col3 (type: string), _col1 (type: timestamp) + outputColumnNames: _col0, _col1, _col2, _col3 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [1, 1, 4, 2] + Statistics: Num rows: 6 Data size: 290 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 6 Data size: 290 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + rowBatchContext: + dataColumnCount: 3 + includeColumns: [1, 2] + dataColumns: nokey:int, key2:int, dt:timestamp + partitionColumnCount: 0 + scratchColumnTypeNames: [string] + Map 2 + Map Operator Tree: + TableScan + alias: item_dim + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 2 Data size: 376 Basic stats: COMPLETE Column stats: NONE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:key1:int, 1:name:string, 2:ROW__ID:struct] + Filter Operator + Filter Vectorization: + className: VectorFilterOperator + native: true + predicateExpression: SelectColumnIsNotNull(col 0:int) + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 2 Data size: 376 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: int), name (type: string) + outputColumnNames: _col0, _col1 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 1] + Statistics: Num rows: 2 Data size: 376 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Reduce Sink Vectorization: + className: VectorReduceSinkLongOperator + keyColumns: 0:int + native: true + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + valueColumns: 1:string + Statistics: Num rows: 2 Data size: 376 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: true + usesVectorUDFAdaptor: false + vectorized: true + rowBatchContext: + dataColumnCount: 2 + includeColumns: [0, 1] + dataColumns: key1:int, name:string + partitionColumnCount: 0 + scratchColumnTypeNames: [] + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select key1, key2, name, dt 
from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1) +PREHOOK: type: QUERY +PREHOOK: Input: default@item_dim +PREHOOK: Input: default@orders_fact +#### A masked pattern was here #### +POSTHOOK: query: select key1, key2, name, dt from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@item_dim +POSTHOOK: Input: default@orders_fact +#### A masked pattern was here #### +102 102 Item 102 2004-04-30 00:00:00 +101 101 Item 101 2001-01-30 00:00:00 diff --git ql/src/test/results/clientpositive/llap/probedecode_mapjoin_stats.q.out ql/src/test/results/clientpositive/llap/probedecode_mapjoin_stats.q.out new file mode 100644 index 0000000000..5fa8ea270f --- /dev/null +++ ql/src/test/results/clientpositive/llap/probedecode_mapjoin_stats.q.out @@ -0,0 +1,419 @@ +PREHOOK: query: CREATE TABLE item_dim (key1 int, name string) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@item_dim +POSTHOOK: query: CREATE TABLE item_dim (key1 int, name string) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@item_dim +PREHOOK: query: CREATE TABLE orders_fact (key3 int, key2 int, dt timestamp) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orders_fact +POSTHOOK: query: CREATE TABLE orders_fact (key3 int, key2 int, dt timestamp) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orders_fact +PREHOOK: query: CREATE TABLE seller_dim (key4 int, sellername string) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@seller_dim +POSTHOOK: query: CREATE TABLE seller_dim (key4 int, sellername string) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@seller_dim +PREHOOK: query: INSERT INTO item_dim values(101, "Item 101") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@item_dim +POSTHOOK: query: INSERT INTO item_dim values(101, "Item 101") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@item_dim +POSTHOOK: Lineage: item_dim.key1 SCRIPT [] +POSTHOOK: Lineage: item_dim.name SCRIPT [] +PREHOOK: query: INSERT INTO item_dim values(102, "Item 102") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@item_dim +POSTHOOK: query: INSERT INTO item_dim values(102, "Item 102") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@item_dim +POSTHOOK: Lineage: item_dim.key1 SCRIPT [] +POSTHOOK: Lineage: item_dim.name SCRIPT [] +PREHOOK: query: INSERT INTO item_dim values(103, "Item 103") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@item_dim +POSTHOOK: query: INSERT INTO item_dim values(103, "Item 103") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@item_dim +POSTHOOK: Lineage: item_dim.key1 SCRIPT [] +POSTHOOK: Lineage: item_dim.name SCRIPT [] +PREHOOK: query: INSERT INTO item_dim values(104, "Item 104") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@item_dim +POSTHOOK: query: INSERT INTO item_dim values(104, "Item 104") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@item_dim 
+POSTHOOK: Lineage: item_dim.key1 SCRIPT [] +POSTHOOK: Lineage: item_dim.name SCRIPT [] +PREHOOK: query: INSERT INTO item_dim values(105, "Item 105") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@item_dim +POSTHOOK: query: INSERT INTO item_dim values(105, "Item 105") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@item_dim +POSTHOOK: Lineage: item_dim.key1 SCRIPT [] +POSTHOOK: Lineage: item_dim.name SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(12345, 101, '2001-01-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(12345, 101, '2001-01-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.key3 SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(23456, 104, '2002-02-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(23456, 104, '2002-02-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.key3 SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(34567, 108, '2003-03-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(34567, 108, '2003-03-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.key3 SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(45678, 102, '2004-04-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(45678, 102, '2004-04-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.key3 SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(56789, 109, '2005-05-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(56789, 109, '2005-05-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.key3 SCRIPT [] +PREHOOK: query: INSERT INTO orders_fact values(67891, 110, '2006-06-30 00:00:00') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orders_fact +POSTHOOK: query: INSERT INTO orders_fact values(67891, 110, '2006-06-30 00:00:00') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orders_fact +POSTHOOK: Lineage: orders_fact.dt SCRIPT [] +POSTHOOK: 
Lineage: orders_fact.key2 SCRIPT [] +POSTHOOK: Lineage: orders_fact.key3 SCRIPT [] +PREHOOK: query: INSERT INTO seller_dim values(12345, "Seller 1") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@seller_dim +POSTHOOK: query: INSERT INTO seller_dim values(12345, "Seller 1") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@seller_dim +POSTHOOK: Lineage: seller_dim.key4 SCRIPT [] +POSTHOOK: Lineage: seller_dim.sellername SCRIPT [] +PREHOOK: query: INSERT INTO seller_dim values(23456, "Seller 2") +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@seller_dim +POSTHOOK: query: INSERT INTO seller_dim values(23456, "Seller 2") +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@seller_dim +POSTHOOK: Lineage: seller_dim.key4 SCRIPT [] +POSTHOOK: Lineage: seller_dim.sellername SCRIPT [] +PREHOOK: query: EXPLAIN VECTORIZATION DETAIL select key1, key2, key3, key4, sellername, name, dt from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1) join seller_dim on (orders_fact.key3 = seller_dim.key4) +PREHOOK: type: QUERY +PREHOOK: Input: default@item_dim +PREHOOK: Input: default@orders_fact +PREHOOK: Input: default@seller_dim +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN VECTORIZATION DETAIL select key1, key2, key3, key4, sellername, name, dt from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1) join seller_dim on (orders_fact.key3 = seller_dim.key4) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@item_dim +POSTHOOK: Input: default@orders_fact +POSTHOOK: Input: default@seller_dim +#### A masked pattern was here #### +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Map 2 (BROADCAST_EDGE), Map 3 (BROADCAST_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: orders_fact + filterExpr: (key2 is not null and key3 is not null) (type: boolean) + probeDecodeDetails: cacheKey:HASH_MAP_MAPJOIN_46_container, bigKeyColName:_col0, smallTablePos:1, keyRatio:0.16666666666666666 + Statistics: Num rows: 6 Data size: 288 Basic stats: COMPLETE Column stats: COMPLETE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:key3:int, 1:key2:int, 2:dt:timestamp, 3:ROW__ID:struct] + Filter Operator + Filter Vectorization: + className: VectorFilterOperator + native: true + predicateExpression: FilterExprAndExpr(children: SelectColumnIsNotNull(col 1:int), SelectColumnIsNotNull(col 0:int)) + predicate: (key2 is not null and key3 is not null) (type: boolean) + Statistics: Num rows: 6 Data size: 288 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key3 (type: int), key2 (type: int), dt (type: timestamp) + outputColumnNames: _col0, _col1, _col2 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 1, 2] + Statistics: Num rows: 6 Data size: 288 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: int) + 1 _col0 (type: int) + Map Join Vectorization: + bigTableKeyColumns: 1:int + bigTableRetainColumnNums: [0, 1, 2] + bigTableValueColumns: 0:int, 1:int, 
2:timestamp + className: VectorMapJoinInnerLongOperator + native: true + nativeConditionsMet: hive.mapjoin.optimized.hashtable IS true, hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small table vectorizes IS true, Fast Hash Table and No Hybrid Hash Join IS true + nonOuterSmallTableKeyMapping: [] + projectedOutput: 0:int, 1:int, 2:timestamp, 1:int, 4:string + smallTableValueMapping: 4:string + hashTableImplementationType: FAST + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + input vertices: + 1 Map 2 + Statistics: Num rows: 5 Data size: 720 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + Map Join Vectorization: + bigTableKeyColumns: 0:int + bigTableRetainColumnNums: [0, 1, 2, 4] + bigTableValueColumns: 0:int, 1:int, 2:timestamp, 1:int, 4:string + className: VectorMapJoinInnerLongOperator + native: true + nativeConditionsMet: hive.mapjoin.optimized.hashtable IS true, hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small table vectorizes IS true, Fast Hash Table and No Hybrid Hash Join IS true + nonOuterSmallTableKeyMapping: [] + projectedOutput: 0:int, 1:int, 2:timestamp, 1:int, 4:string, 0:int, 5:string + smallTableValueMapping: 5:string + hashTableImplementationType: FAST + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 + input vertices: + 1 Map 3 + Statistics: Num rows: 2 Data size: 480 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col3 (type: int), _col1 (type: int), _col0 (type: int), _col5 (type: int), _col6 (type: string), _col4 (type: string), _col2 (type: timestamp) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [1, 1, 0, 0, 5, 4, 2] + Statistics: Num rows: 2 Data size: 480 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 2 Data size: 480 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + rowBatchContext: + dataColumnCount: 3 + includeColumns: [0, 1, 2] + dataColumns: key3:int, key2:int, dt:timestamp + partitionColumnCount: 0 + scratchColumnTypeNames: [string, string] + Map 2 + Map Operator Tree: + TableScan + alias: item_dim + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 5 Data size: 480 Basic stats: COMPLETE Column stats: COMPLETE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:key1:int, 1:name:string, 2:ROW__ID:struct] + Filter Operator + Filter Vectorization: + className: VectorFilterOperator + 
native: true + predicateExpression: SelectColumnIsNotNull(col 0:int) + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 5 Data size: 480 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key1 (type: int), name (type: string) + outputColumnNames: _col0, _col1 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 1] + Statistics: Num rows: 5 Data size: 480 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Reduce Sink Vectorization: + className: VectorReduceSinkLongOperator + keyColumns: 0:int + native: true + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + valueColumns: 1:string + Statistics: Num rows: 5 Data size: 480 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: true + usesVectorUDFAdaptor: false + vectorized: true + rowBatchContext: + dataColumnCount: 2 + includeColumns: [0, 1] + dataColumns: key1:int, name:string + partitionColumnCount: 0 + scratchColumnTypeNames: [] + Map 3 + Map Operator Tree: + TableScan + alias: seller_dim + filterExpr: key4 is not null (type: boolean) + Statistics: Num rows: 2 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:key4:int, 1:sellername:string, 2:ROW__ID:struct] + Filter Operator + Filter Vectorization: + className: VectorFilterOperator + native: true + predicateExpression: SelectColumnIsNotNull(col 0:int) + predicate: key4 is not null (type: boolean) + Statistics: Num rows: 2 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: key4 (type: int), sellername (type: string) + outputColumnNames: _col0, _col1 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 1] + Statistics: Num rows: 2 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Reduce Sink Vectorization: + className: VectorReduceSinkLongOperator + keyColumns: 0:int + native: true + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + valueColumns: 1:string + Statistics: Num rows: 2 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] + inputFileFormats: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: true + usesVectorUDFAdaptor: false + vectorized: true + rowBatchContext: + dataColumnCount: 2 + includeColumns: [0, 1] + dataColumns: key4:int, sellername:string + partitionColumnCount: 0 + scratchColumnTypeNames: [] + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select key1, key2, key3, key4, sellername, name, dt from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1) join seller_dim on (orders_fact.key3 = seller_dim.key4) +PREHOOK: type: QUERY +PREHOOK: Input: default@item_dim +PREHOOK: Input: default@orders_fact +PREHOOK: Input: default@seller_dim +#### A masked pattern was here #### +POSTHOOK: query: select key1, key2, key3, key4, sellername, name, dt from orders_fact join item_dim on (orders_fact.key2 = item_dim.key1) join seller_dim on (orders_fact.key3 = seller_dim.key4) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@item_dim +POSTHOOK: Input: default@orders_fact +POSTHOOK: Input: default@seller_dim +#### A masked pattern was here #### +101 101 12345 12345 Seller 1 Item 101 2001-01-30 00:00:00 +104 104 23456 23456 Seller 2 Item 104 2002-03-02 00:00:00
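A minimal, self-contained sketch of the "LowestRatio" selection policy this patch introduces in TezCompiler#selectLowestRatioProbeDecodeMapJoin: among the MapJoins hanging off a single TableScan, keep the one whose small-table key NDV is smallest relative to the scanned key NDV, since that hashtable is expected to drop the most rows at decode time. The CandidateJoin type and the statistics below are illustrative stand-ins, not Hive classes or real numbers.

import java.util.Arrays;
import java.util.List;

public class ProbeDecodeSelectionSketch {

  /** Hypothetical stand-in for a single-key MapJoin candidate plus its column statistics. */
  static final class CandidateJoin {
    final String cacheKey;        // key under which the small-table hashtable is cached for LLAP IO
    final String bigTableKeyCol;  // probe column read from the scanned (big) table
    final long smallTableKeyNdv;  // distinct join keys on the small (build) side
    final long bigTableKeyNdv;    // distinct join keys on the scanned (probe) side

    CandidateJoin(String cacheKey, String bigTableKeyCol, long smallTableKeyNdv, long bigTableKeyNdv) {
      this.cacheKey = cacheKey;
      this.bigTableKeyCol = bigTableKeyCol;
      this.smallTableKeyNdv = smallTableKeyNdv;
      this.bigTableKeyNdv = bigTableKeyNdv;
    }

    /** Lower ratio means the hashtable probe is expected to filter out more rows at decode time. */
    double keyRatio() {
      return smallTableKeyNdv / (double) bigTableKeyNdv;
    }
  }

  /** Mirrors the LowestRatio policy: keep the candidate with the smallest key ratio. */
  static CandidateJoin selectLowestRatio(List<CandidateJoin> candidates) {
    CandidateJoin selected = null;
    for (CandidateJoin candidate : candidates) {
      if (selected == null || candidate.keyRatio() < selected.keyRatio()) {
        selected = candidate;
      }
    }
    return selected;
  }

  public static void main(String[] args) {
    // Made-up statistics for two dimension joins over the same fact-table scan.
    List<CandidateJoin> candidates = Arrays.asList(
        new CandidateJoin("HASH_MAP_MAPJOIN_25_container", "key2", 5, 6),
        new CandidateJoin("HASH_MAP_MAPJOIN_46_container", "key3", 1, 6));

    CandidateJoin chosen = selectLowestRatio(candidates);
    System.out.printf("probe with %s on column %s, keyRatio=%.3f%n",
        chosen.cacheKey, chosen.bigTableKeyCol, chosen.keyRatio());
  }
}

In the patch itself, a candidate is only considered when the MapJoin has a single join key of a non-STRING, non-BYTE primitive type (isValidProbeDecodeMapJoin), and the chosen ProbeDecodeContext is dropped again in SharedWorkOptimizer when the TableScan ends up shared across branches.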