diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 1dbae40..dd7ddce 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2848,6 +2848,11 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal TEZ_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE("hive.tez.dynamic.partition.pruning.max.data.size", 100*1024*1024L, "Maximum total data size of events in dynamic pruning."), + TEZ_DYNAMIC_SEMIJOIN_REDUCTION("hive.tez.dynamic.semijoin.reduction", true, + "When dynamic semijoin is enabled, shuffle joins will perform a leaky semijoin before shuffle. This " + + "requires hive.tez.dynamic.partition.pruning to be enabled."), + TEZ_MAX_BLOOM_FILTER_ENTRIES("hive.tez.max.bloom.filter.entries", 100000000L, + "Maximum number of entries the semijoin bloom filter may hold; a bloom filter larger than this is no longer effective"), TEZ_SMB_NUMBER_WAVES( "hive.tez.smb.number.waves", (float) 0.5, diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 792de2e..bd76b7d 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -154,6 +154,7 @@ minillaplocal.shared.query.files=alter_merge_2_orc.q,\ delete_whole_partition.q,\ disable_merge_for_bucketing.q,\ dynamic_partition_pruning.q,\ + dynamic_semijoin_reduction.q,\ dynpart_sort_opt_vectorization.q,\ dynpart_sort_optimization.q,\ dynpart_sort_optimization2.q,\ @@ -480,6 +481,7 @@ minillaplocal.query.files=acid_globallimit.q,\ correlationoptimizer6.q,\ disable_merge_for_bucketing.q,\ dynamic_partition_pruning.q,\ + dynamic_semijoin_reduction.q,\ dynpart_sort_opt_vectorization.q,\ dynpart_sort_optimization.q,\ dynpart_sort_optimization_acid.q,\ diff --git a/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java index cdd62ac..30b42ee 100644 --- a/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java +++ b/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java @@ -76,7 +76,7 @@ public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator, Object literal, List literalList) { return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName, - literal, literalList); + literal, literalList, null); } // can add .verboseLogging() to cause Mockito to log invocations diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java index 69ba4a2..669e23e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java @@ -70,7 +70,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { if (conf.getGenJoinKeys()) { int tagLen = conf.getTagLength(); joinKeys = new List[tagLen]; - JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), NOTSKIPBIGTABLE); + JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), NOTSKIPBIGTABLE, hconf); joinKeysObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinKeys, inputObjInspectors,NOTSKIPBIGTABLE, tagLen); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java index 7e9007c..df1898e 100644 ---
a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java @@ -254,11 +254,11 @@ protected void initializeOp(Configuration hconf) throws HiveException { noOuterJoin = conf.isNoOuterJoin(); totalSz = JoinUtil.populateJoinKeyValue(joinValues, conf.getExprs(), - order,NOTSKIPBIGTABLE); + order,NOTSKIPBIGTABLE, hconf); //process join filters joinFilters = new List[tagLen]; - JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(),order,NOTSKIPBIGTABLE); + JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(),order,NOTSKIPBIGTABLE, hconf); joinValuesObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinValues, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/DynamicValueRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/DynamicValueRegistry.java new file mode 100644 index 0000000..63336bd --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/DynamicValueRegistry.java @@ -0,0 +1,30 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.exec; + +public interface DynamicValueRegistry { + + // Abstract class to hold info required for the implementation + public static abstract class RegistryConf { + } + + Object getValue(String key) throws Exception; + + void init(RegistryConf conf) throws Exception; +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java index 24c8281..b0384df 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -38,8 +39,8 @@ private transient StructField[] fields; private transient boolean[] unionField; - public ExprNodeColumnEvaluator(ExprNodeColumnDesc expr) { - super(expr); + public ExprNodeColumnEvaluator(ExprNodeColumnDesc expr, Configuration conf) { + super(expr, conf); } @Override diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantDefaultEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantDefaultEvaluator.java index 89a75eb..f53c3e3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantDefaultEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantDefaultEvaluator.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDefaultDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; @@ -33,7 +34,11 @@ transient ObjectInspector writableObjectInspector; public ExprNodeConstantDefaultEvaluator(ExprNodeConstantDefaultDesc expr) { - super(expr); + this(expr, null); + } + + public ExprNodeConstantDefaultEvaluator(ExprNodeConstantDefaultDesc expr, Configuration conf) { + super(expr, conf); writableObjectInspector = expr.getWritableObjectInspector(); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantEvaluator.java index 4fe72a0..ca39e21 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantEvaluator.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; @@ -32,7 +33,11 @@ transient ConstantObjectInspector writableObjectInspector; public ExprNodeConstantEvaluator(ExprNodeConstantDesc expr) { - super(expr); + this(expr, null); + } + + public ExprNodeConstantEvaluator(ExprNodeConstantDesc expr, Configuration conf) { + super(expr, conf); writableObjectInspector = expr.getWritableObjectInspector(); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeDynamicValueEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeDynamicValueEvaluator.java new file mode 100644 index 0000000..6c68215 --- /dev/null +++ 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeDynamicValueEvaluator.java @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.DynamicValue; +import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicValueDesc; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; + +/** + * ExprNodeDynamicValueEvaluator. + * + */ +public class ExprNodeDynamicValueEvaluator extends ExprNodeEvaluator { + + transient ObjectInspector oi; + + public ExprNodeDynamicValueEvaluator(ExprNodeDynamicValueDesc expr, Configuration conf) { + super(expr, conf); + oi = ObjectInspectorUtils.getStandardObjectInspector(expr.getWritableObjectInspector(), ObjectInspectorCopyOption.WRITABLE); + } + + @Override + public ObjectInspector initialize(ObjectInspector rowInspector) throws HiveException { + return oi; + } + + @Override + protected Object _evaluate(Object row, int version) throws HiveException { + DynamicValue dynamicValue = expr.getDynamicValue(); + dynamicValue.setConf(conf); + return dynamicValue.getWritableValue(); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluator.java index b8d6ab7..375d65f 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluator.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -30,9 +31,11 @@ protected final T expr; protected ObjectInspector outputOI; + protected Configuration conf; - public ExprNodeEvaluator(T expr) { + public ExprNodeEvaluator(T expr, Configuration conf) { this.expr = expr; + this.conf = conf; } /** @@ -109,4 +112,12 @@ public boolean isStateful() { public String toString() { return "ExprNodeEvaluator[" + expr + "]"; } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorFactory.java index 0d03d8f..34aec55 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorFactory.java +++
b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorFactory.java @@ -21,11 +21,13 @@ import java.util.HashMap; import java.util.Map; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDefaultDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicValueDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; @@ -39,9 +41,13 @@ private ExprNodeEvaluatorFactory() { } public static ExprNodeEvaluator get(ExprNodeDesc desc) throws HiveException { + return get(desc, null); + } + + public static ExprNodeEvaluator get(ExprNodeDesc desc, Configuration conf) throws HiveException { // Constant node if (desc instanceof ExprNodeConstantDesc) { - return new ExprNodeConstantEvaluator((ExprNodeConstantDesc) desc); + return new ExprNodeConstantEvaluator((ExprNodeConstantDesc) desc, conf); } // Special 'default' constant node @@ -51,15 +57,19 @@ public static ExprNodeEvaluator get(ExprNodeDesc desc) throws HiveException { // Column-reference node, e.g. a column in the input row if (desc instanceof ExprNodeColumnDesc) { - return new ExprNodeColumnEvaluator((ExprNodeColumnDesc) desc); + return new ExprNodeColumnEvaluator((ExprNodeColumnDesc) desc, conf); } // Generic Function node, e.g. CASE, an operator or a UDF node if (desc instanceof ExprNodeGenericFuncDesc) { - return new ExprNodeGenericFuncEvaluator((ExprNodeGenericFuncDesc) desc); + return new ExprNodeGenericFuncEvaluator((ExprNodeGenericFuncDesc) desc, conf); } // Field node, e.g. 
get a.myfield1 from a if (desc instanceof ExprNodeFieldDesc) { - return new ExprNodeFieldEvaluator((ExprNodeFieldDesc) desc); + return new ExprNodeFieldEvaluator((ExprNodeFieldDesc) desc, conf); + } + // Dynamic value which will be determined during query runtime + if (desc instanceof ExprNodeDynamicValueDesc) { + return new ExprNodeDynamicValueEvaluator((ExprNodeDynamicValueDesc) desc, conf); } throw new RuntimeException( "Cannot find ExprNodeEvaluator for the exprNodeDesc = " + desc); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorHead.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorHead.java index 42685fb..991bc13 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorHead.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorHead.java @@ -30,7 +30,7 @@ private final ExprNodeEvaluator referencing; public ExprNodeEvaluatorHead(ExprNodeEvaluator referencing) { - super(referencing.getExpr()); + super(referencing.getExpr(), referencing.getConf()); this.referencing = referencing; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorRef.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorRef.java index 0a6b66a..625d486 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorRef.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorRef.java @@ -30,7 +30,7 @@ private final ExprNodeEvaluator referencing; public ExprNodeEvaluatorRef(ExprNodeEvaluator referencing) { - super(referencing.getExpr()); + super(referencing.getExpr(), referencing.getConf()); this.referencing = referencing; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeFieldEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeFieldEvaluator.java index ff32626..1241343 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeFieldEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeFieldEvaluator.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.List; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; @@ -43,9 +44,9 @@ transient ObjectInspector structFieldObjectInspector; transient ObjectInspector resultObjectInspector; - public ExprNodeFieldEvaluator(ExprNodeFieldDesc desc) throws HiveException { - super(desc); - leftEvaluator = ExprNodeEvaluatorFactory.get(desc.getDesc()); + public ExprNodeFieldEvaluator(ExprNodeFieldDesc desc, Configuration conf) throws HiveException { + super(desc, conf); + leftEvaluator = ExprNodeEvaluatorFactory.get(desc.getDesc(), conf); } @Override diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeGenericFuncEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeGenericFuncEvaluator.java index 221abd9..8b9baa6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeGenericFuncEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeGenericFuncEvaluator.java @@ -20,6 +20,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; @@ -91,13 +92,13 @@ public Object get() throws HiveException { } } - public 
ExprNodeGenericFuncEvaluator(ExprNodeGenericFuncDesc expr) throws HiveException { - super(expr); + public ExprNodeGenericFuncEvaluator(ExprNodeGenericFuncDesc expr, Configuration conf) throws HiveException { + super(expr, conf); children = new ExprNodeEvaluator[expr.getChildren().size()]; isEager = false; for (int i = 0; i < children.length; i++) { ExprNodeDesc child = expr.getChildren().get(i); - ExprNodeEvaluator nodeEvaluator = ExprNodeEvaluatorFactory.get(child); + ExprNodeEvaluator nodeEvaluator = ExprNodeEvaluatorFactory.get(child, conf); children[i] = nodeEvaluator; // If we have eager evaluators anywhere below us, then we are eager too. if (nodeEvaluator instanceof ExprNodeGenericFuncEvaluator) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java index bd0d28c..df30ab2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java @@ -60,7 +60,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { try { heartbeatInterval = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVESENDHEARTBEAT); - conditionEvaluator = ExprNodeEvaluatorFactory.get(conf.getPredicate()); + conditionEvaluator = ExprNodeEvaluatorFactory.get(conf.getPredicate(), hconf); if (HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEEXPREVALUATIONCACHE)) { conditionEvaluator = ExprNodeEvaluatorFactory.toCachedEval(conditionEvaluator); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index 4fce1ac..e166eee 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -370,6 +370,7 @@ system.registerGenericUDF("not", GenericUDFOPNot.class); system.registerGenericUDF("!", GenericUDFOPNot.class); system.registerGenericUDF("between", GenericUDFBetween.class); + system.registerGenericUDF("in_bloom_filter", GenericUDFInBloomFilter.class); system.registerGenericUDF("ewah_bitmap_and", GenericUDFEWAHBitmapAnd.class); system.registerGenericUDF("ewah_bitmap_or", GenericUDFEWAHBitmapOr.class); @@ -427,7 +428,7 @@ system.registerGenericUDAF("ewah_bitmap", new GenericUDAFEWAHBitmap()); system.registerGenericUDAF("compute_stats", new GenericUDAFComputeStats()); - + system.registerGenericUDAF("bloom_filter", new GenericUDAFBloomFilter()); system.registerUDAF("percentile", UDAFPercentile.class); @@ -472,7 +473,6 @@ system.registerGenericUDF("to_unix_timestamp", GenericUDFToUnixTimeStamp.class); system.registerGenericUDF("internal_interval", GenericUDFInternalInterval.class); - // Generic UDTF's system.registerGenericUDTF("explode", GenericUDTFExplode.class); system.registerGenericUDTF("replicate_rows", GenericUDTFReplicateRows.class); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java index 073147f..be561ce 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java @@ -212,7 +212,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { keyObjectInspectors = new ObjectInspector[numKeys]; currentKeyObjectInspectors = new ObjectInspector[numKeys]; for (int i = 0; i < numKeys; i++) { - keyFields[i] = ExprNodeEvaluatorFactory.get(conf.getKeys().get(i)); + keyFields[i] = 
ExprNodeEvaluatorFactory.get(conf.getKeys().get(i), hconf); keyObjectInspectors[i] = keyFields[i].initialize(rowInspector); currentKeyObjectInspectors[i] = ObjectInspectorUtils .getStandardObjectInspector(keyObjectInspectors[i], @@ -258,7 +258,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { new ExprNodeColumnDesc(TypeInfoUtils.getTypeInfoFromObjectInspector( sf.getFieldObjectInspector()), keyField.getFieldName() + "." + sf.getFieldName(), null, - false)); + false), hconf); unionExprEval.initialize(rowInspector); } } @@ -283,7 +283,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { aggregationParameterObjects[i] = new Object[parameters.size()]; for (int j = 0; j < parameters.size(); j++) { aggregationParameterFields[i][j] = ExprNodeEvaluatorFactory - .get(parameters.get(j)); + .get(parameters.get(j), hconf); aggregationParameterObjectInspectors[i][j] = aggregationParameterFields[i][j] .initialize(rowInspector); if (unionExprEval != null) { @@ -352,6 +352,21 @@ protected void initializeOp(Configuration hconf) throws HiveException { } } + // grouping id should be pruned, which is the last of key columns + // see ColumnPrunerGroupByProc + outputKeyLength = conf.pruneGroupingSetId() ? keyFields.length - 1 : keyFields.length; + + // init objectInspectors + ObjectInspector[] objectInspectors = + new ObjectInspector[outputKeyLength + aggregationEvaluators.length]; + for (int i = 0; i < outputKeyLength; i++) { + objectInspectors[i] = currentKeyObjectInspectors[i]; + } + for (int i = 0; i < aggregationEvaluators.length; i++) { + objectInspectors[outputKeyLength + i] = aggregationEvaluators[i].init(conf.getAggregators() + .get(i).getMode(), aggregationParameterObjectInspectors[i]); + } + aggregationsParametersLastInvoke = new Object[conf.getAggregators().size()][]; if ((conf.getMode() != GroupByDesc.Mode.HASH || conf.getBucketGroup()) && (!groupingSetsPresent)) { @@ -374,21 +389,6 @@ protected void initializeOp(Configuration hconf) throws HiveException { List fieldNames = new ArrayList(conf.getOutputColumnNames()); - // grouping id should be pruned, which is the last of key columns - // see ColumnPrunerGroupByProc - outputKeyLength = conf.pruneGroupingSetId() ? 
keyFields.length - 1 : keyFields.length; - - // init objectInspectors - ObjectInspector[] objectInspectors = - new ObjectInspector[outputKeyLength + aggregationEvaluators.length]; - for (int i = 0; i < outputKeyLength; i++) { - objectInspectors[i] = currentKeyObjectInspectors[i]; - } - for (int i = 0; i < aggregationEvaluators.length; i++) { - objectInspectors[outputKeyLength + i] = aggregationEvaluators[i].init(conf.getAggregators() - .get(i).getMode(), aggregationParameterObjectInspectors[i]); - } - outputObjInspector = ObjectInspectorFactory .getStandardStructObjectInspector(fieldNames, Arrays.asList(objectInspectors)); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java index ac5331e..3a366f6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java @@ -143,19 +143,19 @@ protected void initializeOp(Configuration hconf) throws HiveException { // process join keys joinKeys = new List[tagLen]; - JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), posBigTableAlias); + JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), posBigTableAlias, hconf); joinKeysObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinKeys, inputObjInspectors, posBigTableAlias, tagLen); // process join values joinValues = new List[tagLen]; - JoinUtil.populateJoinKeyValue(joinValues, conf.getExprs(), posBigTableAlias); + JoinUtil.populateJoinKeyValue(joinValues, conf.getExprs(), posBigTableAlias, hconf); joinValuesObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinValues, inputObjInspectors, posBigTableAlias, tagLen); // process join filters joinFilters = new List[tagLen]; - JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(), posBigTableAlias); + JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(), posBigTableAlias, hconf); joinFilterObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinFilters, inputObjInspectors, posBigTableAlias, tagLen); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinUtil.java index 9718c48..07a3dc6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinUtil.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinUtil.java @@ -121,14 +121,14 @@ } public static int populateJoinKeyValue(List[] outMap, - Map> inputMap, int posBigTableAlias) throws HiveException { - return populateJoinKeyValue(outMap, inputMap, null, posBigTableAlias); + Map> inputMap, int posBigTableAlias, Configuration conf) throws HiveException { + return populateJoinKeyValue(outMap, inputMap, null, posBigTableAlias, conf); } public static int populateJoinKeyValue(List[] outMap, Map> inputMap, Byte[] order, - int posBigTableAlias) throws HiveException { + int posBigTableAlias, Configuration conf) throws HiveException { int total = 0; for (Entry> e : inputMap.entrySet()) { if (e.getValue() == null) { @@ -140,7 +140,7 @@ public static int populateJoinKeyValue(List[] outMap, if (key == (byte) posBigTableAlias) { valueFields.add(null); } else { - valueFields.add(ExprNodeEvaluatorFactory.get(expr)); + valueFields.add(ExprNodeEvaluatorFactory.get(expr, conf)); } } outMap[key] = valueFields; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCache.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCache.java index 440e0a1..b931c95 100644 --- 
a/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCache.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCache.java @@ -44,6 +44,16 @@ public T retrieve(String key, Callable fn) throws HiveException; /** + * Retrieve object from cache. + * + * @param + * @param key + * the key of the cached object to retrieve + * @return the last cached object with the key, null if none. + */ + public T retrieve(String key) throws HiveException; + + /** * Retrieve object from cache asynchronously. * * @param diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCacheWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCacheWrapper.java index 9768efa..71bcd98 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCacheWrapper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCacheWrapper.java @@ -36,6 +36,11 @@ public void release(String key) { } @Override + public T retrieve(String key) throws HiveException { + return globalCache.retrieve(makeKey(key)); + } + + @Override public T retrieve(String key, Callable fn) throws HiveException { return globalCache.retrieve(makeKey(key), fn); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java index 9049ddd..a30c771 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java @@ -63,7 +63,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { eval = new ExprNodeEvaluator[colList.size()]; for (int i = 0; i < colList.size(); i++) { assert (colList.get(i) != null); - eval[i] = ExprNodeEvaluatorFactory.get(colList.get(i)); + eval[i] = ExprNodeEvaluatorFactory.get(colList.get(i), hconf); } if (HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEEXPREVALUATIONCACHE)) { eval = ExprNodeEvaluatorFactory.toCachedEvals(eval); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ObjectCache.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ObjectCache.java index 008f8a4..cfe1750 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ObjectCache.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ObjectCache.java @@ -47,6 +47,11 @@ public void release(String key) { } @Override + public T retrieve(String key) throws HiveException { + return retrieve(key, null); + } + + @Override public T retrieve(String key, Callable fn) throws HiveException { try { if (isDebugEnabled) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DynamicValueRegistryTez.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DynamicValueRegistryTez.java new file mode 100644 index 0000000..7bbedf6 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DynamicValueRegistryTez.java @@ -0,0 +1,131 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.tez; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory; +import org.apache.hadoop.hive.ql.exec.DynamicValueRegistry; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo; +import org.apache.hadoop.hive.ql.plan.BaseWork; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.util.ReflectionUtils; +import org.apache.tez.runtime.api.Input; +import org.apache.tez.runtime.api.LogicalInput; +import org.apache.tez.runtime.api.ProcessorContext; +import org.apache.tez.runtime.library.api.KeyValueReader; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DynamicValueRegistryTez implements DynamicValueRegistry { + private static final Logger LOG = LoggerFactory.getLogger(DynamicValueRegistryTez.class); + + public static class RegistryConfTez extends RegistryConf { + public Configuration conf; + public BaseWork baseWork; + public ProcessorContext processorContext; + public Map inputs; + + public RegistryConfTez(Configuration conf, BaseWork baseWork, + ProcessorContext processorContext, Map inputs) { + super(); + this.conf = conf; + this.baseWork = baseWork; + this.processorContext = processorContext; + this.inputs = inputs; + } + } + + protected Map values = Collections.synchronizedMap(new HashMap()); + + public DynamicValueRegistryTez() { + } + + @Override + public Object getValue(String key) { + if (!values.containsKey(key)) { + throw new IllegalStateException("Value does not exist in registry: " + key); + } + return values.get(key); + } + + protected void setValue(String key, Object value) { + values.put(key, value); + } + + @Override + public void init(RegistryConf conf) throws Exception { + RegistryConfTez rct = (RegistryConfTez) conf; + + for (String inputSourceName : rct.baseWork.getInputSourceToRuntimeValuesInfo().keySet()) { + LOG.info("Runtime value source: " + inputSourceName); + + LogicalInput runtimeValueInput = rct.inputs.get(inputSourceName); + RuntimeValuesInfo runtimeValuesInfo = rct.baseWork.getInputSourceToRuntimeValuesInfo().get(inputSourceName); + + // Setup deserializer/obj inspectors for the incoming data source + Deserializer deserializer = ReflectionUtils.newInstance(runtimeValuesInfo.getTableDesc().getDeserializerClass(), null); + deserializer.initialize(rct.conf, runtimeValuesInfo.getTableDesc().getProperties()); + ObjectInspector inspector = deserializer.getObjectInspector(); + + // Set up col expressions for the dynamic values using this input + List colExprEvaluators = new ArrayList(); + for (ExprNodeDesc expr : runtimeValuesInfo.getColExprs()) { + ExprNodeEvaluator exprEval = ExprNodeEvaluatorFactory.get(expr, null); + exprEval.initialize(inspector); + colExprEvaluators.add(exprEval); + } + + runtimeValueInput.start(); + List inputList = new ArrayList(); + 
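+ // Block until this runtime-values input is ready before reading it; its producer vertex emits a single aggregated row (min, max, bloom filter).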
inputList.add(runtimeValueInput); + rct.processorContext.waitForAllInputsReady(inputList); + + KeyValueReader kvReader = (KeyValueReader) runtimeValueInput.getReader(); + long rowCount = 0; + while (kvReader.next()) { + Object row = deserializer.deserialize((Writable) kvReader.getCurrentValue()); + rowCount++; + for (int colIdx = 0; colIdx < colExprEvaluators.size(); ++colIdx) { + // Read each expression and save it to the value registry + ExprNodeEvaluator eval = colExprEvaluators.get(colIdx); + Object val = eval.evaluate(row); + setValue(runtimeValuesInfo.getDynamicValueIDs().get(colIdx), val); + } + } + // For now, expecting a single row (min/max, aggregated bloom filter) + if (rowCount != 1) { + throw new IllegalStateException("Expected 1 row from " + inputSourceName + ", got " + rowCount); + } + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/LlapObjectCache.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/LlapObjectCache.java index 0141230..1ce8ee9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/LlapObjectCache.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/LlapObjectCache.java @@ -60,6 +60,24 @@ public void release(String key) { @SuppressWarnings("unchecked") @Override + public T retrieve(String key) throws HiveException { + + T value = null; + + lock.lock(); + try { + value = (T) registry.getIfPresent(key); + if (value != null && isLogDebugEnabled) { + LOG.debug("Found " + key + " in cache"); + } + return value; + } finally { + lock.unlock(); + } + } + + @SuppressWarnings("unchecked") + @Override public T retrieve(String key, Callable fn) throws HiveException { T value = null; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java index 955fa80..790c9d8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java @@ -51,11 +51,13 @@ import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; +import org.apache.hadoop.hive.ql.exec.tez.DynamicValueRegistryTez.RegistryConfTez; import org.apache.hadoop.hive.ql.exec.tez.TezProcessor.TezKVOutputCollector; import org.apache.hadoop.hive.ql.exec.tez.tools.KeyValueInputMerger; import org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator; import org.apache.hadoop.hive.ql.log.PerfLogger; import org.apache.hadoop.hive.ql.plan.BaseWork; +import org.apache.hadoop.hive.ql.plan.DynamicValue; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.serde2.Deserializer; @@ -88,8 +90,8 @@ private final ExecMapperContext execContext; private MapWork mapWork; List mergeWorkList; - List cacheKeys; - ObjectCache cache; + List cacheKeys, dynamicValueCacheKeys; + ObjectCache cache, dynamicValueCache; private int nRows; public MapRecordProcessor(final JobConf jconf, final ProcessorContext context) throws Exception { @@ -99,9 +101,11 @@ public MapRecordProcessor(final JobConf jconf, final ProcessorContext context) t setLlapOfFragmentId(context); } cache = ObjectCacheFactory.getCache(jconf, queryId, true); + dynamicValueCache = ObjectCacheFactory.getCache(jconf, queryId, false); execContext = new ExecMapperContext(jconf); execContext.setJc(jconf); cacheKeys = new ArrayList(); + dynamicValueCacheKeys = new ArrayList(); nRows = 0; 
} @@ -295,6 +299,21 @@ public Object call() { mapOp.initializeLocalWork(jconf); + // Setup values registry + checkAbortCondition(); + String valueRegistryKey = DynamicValue.DYNAMIC_VALUE_REGISTRY_CACHE_KEY; + // On LLAP dynamic value registry might already be cached. + final DynamicValueRegistryTez registryTez = dynamicValueCache.retrieve(valueRegistryKey, + new Callable() { + @Override + public DynamicValueRegistryTez call() { + return new DynamicValueRegistryTez(); + } + }); + dynamicValueCacheKeys.add(valueRegistryKey); + RegistryConfTez registryConf = new RegistryConfTez(jconf, mapWork, processorContext, inputs); + registryTez.init(registryConf); + checkAbortCondition(); initializeMapRecordSources(); mapOp.initializeMapOperator(jconf); @@ -435,6 +454,12 @@ void close(){ } } + if (dynamicValueCache != null && dynamicValueCacheKeys != null) { + for (String k: dynamicValueCacheKeys) { + dynamicValueCache.release(k); + } + } + // detecting failed executions by exceptions thrown by the operator tree try { if (mapOp == null || mapWork == null) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ObjectCache.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ObjectCache.java index 06dca00..72dcdd3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ObjectCache.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ObjectCache.java @@ -65,6 +65,22 @@ public void release(String key) { LOG.info("Releasing key: " + key); } + + @SuppressWarnings("unchecked") + @Override + public T retrieve(String key) throws HiveException { + T value = null; + try { + value = (T) registry.get(key); + if ( value != null) { + LOG.info("Found " + key + " in cache with value: " + value); + } + } catch (Exception e) { + throw new HiveException(e); + } + return value; + } + @SuppressWarnings("unchecked") @Override public T retrieve(String key, Callable fn) throws HiveException { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java index d80f201..2d06545 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java @@ -40,9 +40,11 @@ import org.apache.hadoop.hive.ql.exec.OperatorUtils; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats; +import org.apache.hadoop.hive.ql.exec.tez.DynamicValueRegistryTez.RegistryConfTez; import org.apache.hadoop.hive.ql.exec.tez.TezProcessor.TezKVOutputCollector; import org.apache.hadoop.hive.ql.log.PerfLogger; import org.apache.hadoop.hive.ql.plan.BaseWork; +import org.apache.hadoop.hive.ql.plan.DynamicValue; import org.apache.hadoop.hive.ql.plan.ReduceWork; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -64,14 +66,14 @@ private static final String REDUCE_PLAN_KEY = "__REDUCE_PLAN__"; - private ObjectCache cache; + private ObjectCache cache, dynamicValueCache; public static final Logger l4j = LoggerFactory.getLogger(ReduceRecordProcessor.class); private ReduceWork reduceWork; List mergeWorkList = null; - List cacheKeys; + List cacheKeys, dynamicValueCacheKeys; private final Map connectOps = new TreeMap(); @@ -91,9 +93,11 @@ public ReduceRecordProcessor(final JobConf jconf, final ProcessorContext context String queryId = HiveConf.getVar(jconf, HiveConf.ConfVars.HIVEQUERYID); cache = ObjectCacheFactory.getCache(jconf, 
queryId, true); + dynamicValueCache = ObjectCacheFactory.getCache(jconf, queryId, false); String cacheKey = processorContext.getTaskVertexName() + REDUCE_PLAN_KEY; cacheKeys = Lists.newArrayList(cacheKey); + dynamicValueCacheKeys = new ArrayList(); reduceWork = (ReduceWork) cache.retrieve(cacheKey, new Callable() { @Override public Object call() { @@ -169,6 +173,21 @@ void init( l4j.info("Memory available for operators set to {}", LlapUtil.humanReadableByteCount(memoryAvailableToTask)); } OperatorUtils.setMemoryAvailable(reducer.getChildOperators(), memoryAvailableToTask); + + // Setup values registry + String valueRegistryKey = DynamicValue.DYNAMIC_VALUE_REGISTRY_CACHE_KEY; + DynamicValueRegistryTez registryTez = dynamicValueCache.retrieve(valueRegistryKey, + new Callable() { + @Override + public DynamicValueRegistryTez call() { + return new DynamicValueRegistryTez(); + } + }); + dynamicValueCacheKeys.add(valueRegistryKey); + RegistryConfTez registryConf = new RegistryConfTez(jconf, reduceWork, processorContext, inputs); + registryTez.init(registryConf); + checkAbortCondition(); + if (numTags > 1) { sources = new ReduceRecordSource[numTags]; mainWorkOIs = new ObjectInspector[numTags]; @@ -348,6 +367,12 @@ void close(){ } } + if (dynamicValueCache != null && dynamicValueCacheKeys != null) { + for (String k: dynamicValueCacheKeys) { + dynamicValueCache.release(k); + } + } + try { for (ReduceRecordSource rs: sources) { abort = abort && rs.close(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java index 0cb6c8a..848fc8e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java @@ -153,7 +153,7 @@ public void assign(VectorExpressionWriter[] writers, List oids) VectorExpression vectorExpr = bigTableValueExpressions[i]; // This is a vectorized aware evaluator - ExprNodeEvaluator eval = new ExprNodeEvaluator(desc) { + ExprNodeEvaluator eval = new ExprNodeEvaluator(desc, hconf) { int columnIndex; int writerIndex; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java index 80b0a14..ac3363e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java @@ -207,7 +207,7 @@ public void assign(VectorExpressionWriter[] writers, List oids) VectorExpression vectorExpr = bigTableValueExpressions[i]; // This is a vectorized aware evaluator - ExprNodeEvaluator eval = new ExprNodeEvaluator(desc) { + ExprNodeEvaluator eval = new ExprNodeEvaluator(desc, hconf) { int columnIndex;; int writerIndex; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java b/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java index 9d900e4..997334b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java @@ -27,9 +27,11 @@ import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.SerializationUtilities; +import org.apache.hadoop.hive.ql.io.sarg.LiteralDelegate; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import 
org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicValueDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; @@ -58,14 +60,16 @@ public class ConvertAstToSearchArg { private static final Logger LOG = LoggerFactory.getLogger(ConvertAstToSearchArg.class); - private final SearchArgument.Builder builder = - SearchArgumentFactory.newBuilder(); + private final SearchArgument.Builder builder; + private final Configuration conf; /** * Builds the expression and leaf list from the original predicate. * @param expression the expression to translate. */ - ConvertAstToSearchArg(ExprNodeGenericFuncDesc expression) { + ConvertAstToSearchArg(Configuration conf, ExprNodeGenericFuncDesc expression) { + this.conf = conf; + builder = SearchArgumentFactory.newBuilder(conf); parse(expression); } @@ -182,7 +186,7 @@ private static Object boxLiteral(ExprNodeConstantDesc constantDesc, * @param type the type of the expression * @return the literal boxed if found or null */ - private static Object findLiteral(ExprNodeGenericFuncDesc expr, + private static Object findLiteral(Configuration conf, ExprNodeGenericFuncDesc expr, PredicateLeaf.Type type) { List children = expr.getChildren(); if (children.size() != 2) { @@ -190,16 +194,29 @@ private static Object findLiteral(ExprNodeGenericFuncDesc expr, } Object result = null; for(ExprNodeDesc child: children) { - if (child instanceof ExprNodeConstantDesc) { + Object currentResult = getLiteral(conf, child, type); + if (currentResult != null) { + // Both children in the expression should not be literal if (result != null) { return null; } - result = boxLiteral((ExprNodeConstantDesc) child, type); + result = currentResult; } } return result; } + private static Object getLiteral(Configuration conf, ExprNodeDesc child, PredicateLeaf.Type type) { + if (child instanceof ExprNodeConstantDesc) { + return boxLiteral((ExprNodeConstantDesc) child, type); + } else if (child instanceof ExprNodeDynamicValueDesc) { + LiteralDelegate value = ((ExprNodeDynamicValueDesc) child).getDynamicValue(); + value.setConf(conf); + return value; + } + return null; + } + /** * Return the boxed literal at the given position * @param expr the parent node @@ -207,15 +224,12 @@ private static Object findLiteral(ExprNodeGenericFuncDesc expr, * @param position the child position to check * @return the boxed literal if found otherwise null */ - private static Object getLiteral(ExprNodeGenericFuncDesc expr, + private static Object getLiteral(Configuration conf, ExprNodeGenericFuncDesc expr, PredicateLeaf.Type type, int position) { List children = expr.getChildren(); - Object child = children.get(position); - if (child instanceof ExprNodeConstantDesc) { - return boxLiteral((ExprNodeConstantDesc) child, type); - } - return null; + ExprNodeDesc child = children.get(position); + return getLiteral(conf, child, type); } private static Object[] getLiteralList(ExprNodeGenericFuncDesc expr, @@ -272,16 +286,16 @@ private void createLeaf(PredicateLeaf.Operator operator, builder.isNull(columnName, type); break; case EQUALS: - builder.equals(columnName, type, findLiteral(expression, type)); + builder.equals(columnName, type, findLiteral(conf, expression, type)); break; case NULL_SAFE_EQUALS: - builder.nullSafeEquals(columnName, type, findLiteral(expression, type)); + 
builder.nullSafeEquals(columnName, type, findLiteral(conf, expression, type)); break; case LESS_THAN: - builder.lessThan(columnName, type, findLiteral(expression, type)); + builder.lessThan(columnName, type, findLiteral(conf, expression, type)); break; case LESS_THAN_EQUALS: - builder.lessThanEquals(columnName, type, findLiteral(expression, type)); + builder.lessThanEquals(columnName, type, findLiteral(conf, expression, type)); break; case IN: builder.in(columnName, type, @@ -289,8 +303,8 @@ private void createLeaf(PredicateLeaf.Operator operator, break; case BETWEEN: builder.between(columnName, type, - getLiteral(expression, type, variable + 1), - getLiteral(expression, type, variable + 2)); + getLiteral(conf, expression, type, variable + 1), + getLiteral(conf, expression, type, variable + 2)); break; } } catch (Exception e) { @@ -425,8 +439,8 @@ private void parse(ExprNodeDesc expression) { public static final String SARG_PUSHDOWN = "sarg.pushdown"; - public static SearchArgument create(ExprNodeGenericFuncDesc expression) { - return new ConvertAstToSearchArg(expression).buildSearchArgument(); + public static SearchArgument create(Configuration conf, ExprNodeGenericFuncDesc expression) { + return new ConvertAstToSearchArg(conf, expression).buildSearchArgument(); } @@ -445,7 +459,7 @@ public static SearchArgument create(byte[] kryoBytes) { public static SearchArgument createFromConf(Configuration conf) { String sargString; if ((sargString = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR)) != null) { - return create(SerializationUtilities.deserializeExpression(sargString)); + return create(conf, SerializationUtilities.deserializeExpression(sargString)); } else if ((sargString = conf.get(SARG_PUSHDOWN)) != null) { return create(sargString); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java index 26fcc45..c8691e8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java @@ -28,13 +28,8 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; -import org.apache.hadoop.hive.ql.exec.FilterOperator; -import org.apache.hadoop.hive.ql.exec.GroupByOperator; -import org.apache.hadoop.hive.ql.exec.Operator; -import org.apache.hadoop.hive.ql.exec.OperatorFactory; -import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; -import org.apache.hadoop.hive.ql.exec.SelectOperator; -import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.*; +import org.apache.hadoop.hive.ql.io.AcidUtils.Operation; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; @@ -50,20 +45,19 @@ import org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext; -import org.apache.hadoop.hive.ql.plan.AggregationDesc; -import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; -import 
org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; -import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicListDesc; -import org.apache.hadoop.hive.ql.plan.FilterDesc; -import org.apache.hadoop.hive.ql.plan.GroupByDesc; -import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.ql.plan.PlanUtils; -import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.plan.*; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBloomFilter.GenericUDAFBloomFilterEvaluator; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -148,15 +142,13 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Obje FilterOperator filter = (FilterOperator) nd; FilterDesc desc = filter.getConf(); - TableScanOperator ts = null; - if (!parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING) && !parseContext.getConf().getBoolVar(ConfVars.SPARK_DYNAMIC_PARTITION_PRUNING)) { // nothing to do when the optimization is off return null; } - DynamicPartitionPrunerContext removerContext = new DynamicPartitionPrunerContext(); + TableScanOperator ts = null; if (filter.getParentOperators().size() == 1 && filter.getParentOperators().get(0) instanceof TableScanOperator) { @@ -169,14 +161,32 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Obje LOG.debug("TableScan: " + ts); } + DynamicPartitionPrunerContext removerContext = new DynamicPartitionPrunerContext(); + // collect the dynamic pruning conditions removerContext.dynLists.clear(); collectDynamicPruningConditions(desc.getPredicate(), removerContext); + if (ts == null) { + // Replace the synthetic predicate with true and bail out + for (DynamicListContext ctx : removerContext) { + ExprNodeDesc constNode = + new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true); + replaceExprNode(ctx, desc, constNode); + } + return false; + } + + final boolean semiJoin = parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION); + for (DynamicListContext ctx : removerContext) { String column = ExprNodeDescUtils.extractColName(ctx.parent); + boolean semiJoinAttempted = false; + + if (column != null) { + // Need unique IDs to refer to each min/max key value in the DynamicValueRegistry + String keyBaseAlias = ""; - if (ts != null && column != null) { Table table = ts.getConf().getTableMetadata(); if (table != null && table.isPartitionKey(column)) { @@ -203,20 +213,56 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Obje } } else { LOG.debug("Column " + column + " is not a partition column"); + if (semiJoin && ts.getConf().getFilterExpr() != null) { + LOG.debug("Initiate semijoin reduction for " + column); + // Get the table name from which the min-max values will come.
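+ // Walk up from the generating reduce sink to its TableScan so keyBaseAlias (opId_tableAlias_column) is unique per source table and key column.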
+ Operator op = ctx.generator; + while (!(op == null || op instanceof TableScanOperator)) { + op = op.getParentOperators().get(0); + } + String tableAlias = (op == null ? "" : ((TableScanOperator) op).getConf().getAlias()); + keyBaseAlias = ctx.generator.getOperatorId() + "_" + tableAlias + "_" + column; + + semiJoinAttempted = generateSemiJoinOperatorPlan(ctx, parseContext, ts, keyBaseAlias); + } } - } - // we always remove the condition by replacing it with "true" - ExprNodeDesc constNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true); - if (ctx.grandParent == null) { - desc.setPredicate(constNode); + + // If semijoin was attempted, replace the condition with a min-max (BETWEEN) filter + // plus a bloom filter; otherwise + // we always remove the condition by replacing it with "true" + if (semiJoinAttempted) { + List betweenArgs = new ArrayList(); + betweenArgs.add(new ExprNodeConstantDesc(Boolean.FALSE)); // Do not invert between result + // add column expression here + betweenArgs.add(ctx.parent.getChildren().get(0)); + betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_min", ctx.desc.getTypeInfo()))); + betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_max", ctx.desc.getTypeInfo()))); + ExprNodeDesc betweenNode = ExprNodeGenericFuncDesc.newInstance( + FunctionRegistry.getFunctionInfo("between").getGenericUDF(), betweenArgs); + replaceExprNode(ctx, desc, betweenNode); + // add column expression for bloom filter + List bloomFilterArgs = new ArrayList(); + bloomFilterArgs.add(ctx.parent.getChildren().get(0)); + bloomFilterArgs.add(new ExprNodeDynamicValueDesc( + new DynamicValue(keyBaseAlias + "_bloom_filter", + TypeInfoFactory.binaryTypeInfo))); + ExprNodeDesc bloomFilterNode = ExprNodeGenericFuncDesc.newInstance( + FunctionRegistry.getFunctionInfo("in_bloom_filter"). + getGenericUDF(), bloomFilterArgs); + // ctx may not have the grandparent but it is set in filterDesc by now. + ExprNodeDesc grandParent = ctx.grandParent == null ? + desc.getPredicate() : ctx.grandParent; + grandParent.getChildren().add(bloomFilterNode); + } else { + ExprNodeDesc replaceNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true); + replaceExprNode(ctx, desc, replaceNode); + } } else { - int i = ctx.grandParent.getChildren().indexOf(ctx.parent); - ctx.grandParent.getChildren().remove(i); - ctx.grandParent.getChildren().add(i, constNode); + ExprNodeDesc constNode = + new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true); + replaceExprNode(ctx, desc, constNode); } } - // if we pushed the predicate into the table scan we need to remove the // synthetic conditions there. cleanTableScanFilters(ts); @@ -224,6 +270,16 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Obje return false; } + private void replaceExprNode(DynamicListContext ctx, FilterDesc desc, ExprNodeDesc node) { + if (ctx.grandParent == null) { + desc.setPredicate(node); + } else { + int i = ctx.grandParent.getChildren().indexOf(ctx.parent); + ctx.grandParent.getChildren().remove(i); + ctx.grandParent.getChildren().add(i, node); + } + } + private void cleanTableScanFilters(TableScanOperator ts) throws SemanticException { if (ts == null || ts.getConf() == null || ts.getConf().getFilterExpr() == null) { @@ -327,6 +383,228 @@ private void generateEventOperatorPlan(DynamicListContext ctx, ParseContext pars } } + // Generates plan for min/max when dynamic partition pruning is ruled out.
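+ // The branch built here projects the join key, aggregates min/max and a bloom filter over it, and ships the single-row result to the big-table filter as dynamic values.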
+ private boolean generateSemiJoinOperatorPlan(DynamicListContext ctx, ParseContext parseContext,
+ TableScanOperator ts, String keyBaseAlias) throws SemanticException {
+
+ // we will put a fork in the plan at the source of the reduce sink
+ Operator parentOfRS = ctx.generator.getParentOperators().get(0);
+
+ // we need the expr that generated the key of the reduce sink
+ ExprNodeDesc key = ctx.generator.getConf().getKeyCols().get(ctx.desc.getKeyIndex());
+
+ if (parentOfRS instanceof SelectOperator) {
+ // Make sure the semijoin branch is not on a partition column.
+ String internalColName = null;
+ ExprNodeDesc exprNodeDesc = key;
+ // Find the ExprNodeColumnDesc
+ while (!(exprNodeDesc instanceof ExprNodeColumnDesc)) {
+ exprNodeDesc = exprNodeDesc.getChildren().get(0);
+ }
+ internalColName = ((ExprNodeColumnDesc) exprNodeDesc).getColumn();
+
+ ExprNodeColumnDesc colExpr = ((ExprNodeColumnDesc)(parentOfRS.
+ getColumnExprMap().get(internalColName)));
+ String colName = ExprNodeDescUtils.extractColName(colExpr);
+
+ // Fetch the TableScan Operator.
+ Operator op = parentOfRS.getParentOperators().get(0);
+ while (op != null && !(op instanceof TableScanOperator)) {
+ op = op.getParentOperators().get(0);
+ }
+ assert op != null;
+
+ Table table = ((TableScanOperator) op).getConf().getTableMetadata();
+ if (table.isPartitionKey(colName)) {
+ // The column is a partition column, skip the optimization.
+ return false;
+ }
+ }
+ List keyExprs = new ArrayList();
+ keyExprs.add(key);
+
+ // group by requires "ArrayList", don't ask.
+ ArrayList outputNames = new ArrayList();
+ outputNames.add(HiveConf.getColumnInternalName(0));
+
+ // project the relevant key column
+ SelectDesc select = new SelectDesc(keyExprs, outputNames);
+ SelectOperator selectOp =
+ (SelectOperator) OperatorFactory.getAndMakeChild(select,
+ new RowSchema(parentOfRS.getSchema()), parentOfRS);
+
+ // do a group by to aggregate min, max and bloom filter.
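+ // Added note: logically this branch computes something like
+ //   SELECT min(key), max(key), bloom_filter(key) FROM <build side>
+ // (a sketch, not the author's wording), realized as the hash-mode partial
+ // group by below followed by a final group by after the shuffle.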
+ float groupByMemoryUsage = + HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY); + float memoryThreshold = + HiveConf.getFloatVar(parseContext.getConf(), + HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD); + + ArrayList groupByExprs = new ArrayList(); + + // Add min/max and bloom filter aggregations + List aggFnOIs = new ArrayList(); + aggFnOIs.add(key.getWritableObjectInspector()); + ArrayList params = new ArrayList(); + params.add( + new ExprNodeColumnDesc(key.getTypeInfo(), outputNames.get(0), + "", false)); + + ArrayList aggs = new ArrayList(); + try { + AggregationDesc min = new AggregationDesc("min", + FunctionRegistry.getGenericUDAFEvaluator("min", aggFnOIs, false, false), + params, false, Mode.PARTIAL1); + AggregationDesc max = new AggregationDesc("max", + FunctionRegistry.getGenericUDAFEvaluator("max", aggFnOIs, false, false), + params, false, Mode.PARTIAL1); + AggregationDesc bloomFilter = new AggregationDesc("bloom_filter", + FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", aggFnOIs, false, false), + params, false, Mode.PARTIAL1); + GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator(); + bloomFilterEval.setSourceOperator(selectOp); + bloomFilterEval.setMaxEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES)); + bloomFilter.setGenericUDAFWritableEvaluator(bloomFilterEval); + aggs.add(min); + aggs.add(max); + aggs.add(bloomFilter); + } catch (SemanticException e) { + LOG.error("Error creating min/max aggregations on key", e); + throw new IllegalStateException("Error creating min/max aggregations on key", e); + } + + // Create the Group by Operator + ArrayList gbOutputNames = new ArrayList(); + gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(0)); + gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(1)); + gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(2)); + GroupByDesc groupBy = new GroupByDesc(GroupByDesc.Mode.HASH, + gbOutputNames, new ArrayList(), aggs, false, + groupByMemoryUsage, memoryThreshold, null, false, 0, false); + + ArrayList groupbyColInfos = new ArrayList(); + groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(0), key.getTypeInfo(), "", false)); + groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(1), key.getTypeInfo(), "", false)); + groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(2), key.getTypeInfo(), "", false)); + + GroupByOperator groupByOp = (GroupByOperator)OperatorFactory.getAndMakeChild( + groupBy, new RowSchema(groupbyColInfos), selectOp); + + groupByOp.setColumnExprMap(new HashMap()); + + // Get the column names of the aggregations for reduce sink + int colPos = 0; + ArrayList rsValueCols = new ArrayList(); + for (int i = 0; i < aggs.size() - 1; i++) { + ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(key.getTypeInfo(), + gbOutputNames.get(colPos++), "", false); + rsValueCols.add(colExpr); + } + + // Bloom Filter uses binary + ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(TypeInfoFactory.binaryTypeInfo, + gbOutputNames.get(colPos++), "", false); + rsValueCols.add(colExpr); + + // Create the reduce sink operator + ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc( + new ArrayList(), rsValueCols, gbOutputNames, false, + -1, 0, 1, Operation.NOT_ACID); + ReduceSinkOperator rsOp = (ReduceSinkOperator)OperatorFactory.getAndMakeChild( + rsDesc, new RowSchema(groupByOp.getSchema()), groupByOp); + Map columnExprMap = new HashMap(); + rsOp.setColumnExprMap(columnExprMap); + + // 
Create the final Group By Operator + ArrayList aggsFinal = new ArrayList(); + try { + List minFinalFnOIs = new ArrayList(); + List maxFinalFnOIs = new ArrayList(); + List bloomFilterFinalFnOIs = new ArrayList(); + ArrayList minFinalParams = new ArrayList(); + ArrayList maxFinalParams = new ArrayList(); + ArrayList bloomFilterFinalParams = new ArrayList(); + // Use the expressions from Reduce Sink. + minFinalFnOIs.add(rsValueCols.get(0).getWritableObjectInspector()); + maxFinalFnOIs.add(rsValueCols.get(1).getWritableObjectInspector()); + bloomFilterFinalFnOIs.add(rsValueCols.get(2).getWritableObjectInspector()); + // Coming from a ReduceSink the aggregations would be in the form VALUE._col0, VALUE._col1 + minFinalParams.add( + new ExprNodeColumnDesc( + rsValueCols.get(0).getTypeInfo(), + Utilities.ReduceField.VALUE + "." + + gbOutputNames.get(0), "", false)); + maxFinalParams.add( + new ExprNodeColumnDesc( + rsValueCols.get(1).getTypeInfo(), + Utilities.ReduceField.VALUE + "." + + gbOutputNames.get(1), "", false)); + bloomFilterFinalParams.add( + new ExprNodeColumnDesc( + rsValueCols.get(2).getTypeInfo(), + Utilities.ReduceField.VALUE + "." + + gbOutputNames.get(2), "", false)); + + AggregationDesc min = new AggregationDesc("min", + FunctionRegistry.getGenericUDAFEvaluator("min", minFinalFnOIs, + false, false), + minFinalParams, false, Mode.FINAL); + AggregationDesc max = new AggregationDesc("max", + FunctionRegistry.getGenericUDAFEvaluator("max", maxFinalFnOIs, + false, false), + maxFinalParams, false, Mode.FINAL); + AggregationDesc bloomFilter = new AggregationDesc("bloom_filter", + FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", bloomFilterFinalFnOIs, + false, false), + bloomFilterFinalParams, false, Mode.FINAL); + GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator(); + bloomFilterEval.setSourceOperator(selectOp); + bloomFilterEval.setMaxEntries(parseContext.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES)); + bloomFilter.setGenericUDAFWritableEvaluator(bloomFilterEval); + + aggsFinal.add(min); + aggsFinal.add(max); + aggsFinal.add(bloomFilter); + } catch (SemanticException e) { + LOG.error("Error creating min/max aggregations on key", e); + throw new IllegalStateException("Error creating min/max aggregations on key", e); + } + + GroupByDesc groupByDescFinal = new GroupByDesc(GroupByDesc.Mode.FINAL, + gbOutputNames, new ArrayList(), aggsFinal, false, + groupByMemoryUsage, memoryThreshold, null, false, 0, false); + GroupByOperator groupByOpFinal = (GroupByOperator)OperatorFactory.getAndMakeChild( + groupByDescFinal, new RowSchema(rsOp.getSchema()), rsOp); + groupByOpFinal.setColumnExprMap(new HashMap()); + + // Create the final Reduce Sink Operator + ReduceSinkDesc rsDescFinal = PlanUtils.getReduceSinkDesc( + new ArrayList(), rsValueCols, gbOutputNames, false, + -1, 0, 1, Operation.NOT_ACID); + ReduceSinkOperator rsOpFinal = (ReduceSinkOperator)OperatorFactory.getAndMakeChild( + rsDescFinal, new RowSchema(groupByOpFinal.getSchema()), groupByOpFinal); + rsOpFinal.setColumnExprMap(columnExprMap); + + LOG.debug("DynamicMinMaxPushdown: Saving RS to TS mapping: " + rsOpFinal + ": " + ts); + parseContext.getRsOpToTsOpMap().put(rsOpFinal, ts); + + // Save the info that is required at query time to resolve dynamic/runtime values. 
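+ // (Added note: the "<keyBaseAlias>_min"/"_max"/"_bloom_filter" ids registered
+ // below are the same ids the rewritten filter's DynamicValue nodes resolve at
+ // run time through the DynamicValueRegistry, so both sides must agree on the
+ // naming scheme.)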
+ RuntimeValuesInfo runtimeValuesInfo = new RuntimeValuesInfo(); + TableDesc rsFinalTableDesc = PlanUtils.getReduceValueTableDesc( + PlanUtils.getFieldSchemasFromColumnList(rsValueCols, "_col")); + List dynamicValueIDs = new ArrayList(); + dynamicValueIDs.add(keyBaseAlias + "_min"); + dynamicValueIDs.add(keyBaseAlias + "_max"); + dynamicValueIDs.add(keyBaseAlias + "_bloom_filter"); + + runtimeValuesInfo.setTableDesc(rsFinalTableDesc); + runtimeValuesInfo.setDynamicValueIDs(dynamicValueIDs); + runtimeValuesInfo.setColExprs(rsValueCols); + parseContext.getRsToRuntimeValuesInfoMap().put(rsOpFinal, runtimeValuesInfo); + + return true; + } + private Map collectDynamicPruningConditions(ExprNodeDesc pred, NodeProcessorCtx ctx) throws SemanticException { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java index 9e9beb0..b853a06 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java @@ -135,7 +135,7 @@ protected void generatePredicate(NodeProcessorCtx procCtx, return; } // the sargs are closely tied to hive.optimize.index.filter - SearchArgument sarg = ConvertAstToSearchArg.create(filter); + SearchArgument sarg = ConvertAstToSearchArg.create(ctxt.pctx.getConf(), filter); if (sarg == null) { return; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RedundantDynamicPruningConditionsRemoval.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RedundantDynamicPruningConditionsRemoval.java index d9ce017..b8a60f9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RedundantDynamicPruningConditionsRemoval.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RedundantDynamicPruningConditionsRemoval.java @@ -24,6 +24,7 @@ import java.util.Stack; import org.apache.calcite.util.Pair; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.FilterOperator; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.TableScanOperator; @@ -75,16 +76,19 @@ */ @Override public ParseContext transform(ParseContext pctx) throws SemanticException { - Map opRules = new LinkedHashMap(); - opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%" + - FilterOperator.getOperatorName() + "%"), new FilterTransformer()); - - Dispatcher disp = new DefaultRuleDispatcher(null, opRules, null); - GraphWalker ogw = new DefaultGraphWalker(disp); - - List topNodes = new ArrayList(); - topNodes.addAll(pctx.getTopOps().values()); - ogw.startWalking(topNodes, null); + // Make sure semijoin is not enabled. If it is, then do not remove the dynamic partition pruning predicates. 
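+ // (Added rationale: the semijoin reduction reuses those synthetic predicates
+ // as anchors for its min/max/bloom-filter conditions, so stripping them here
+ // would silently disable the optimization.)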
+ if (!pctx.getConf().getBoolVar(HiveConf.ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION)) { + Map opRules = new LinkedHashMap(); + opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%" + + FilterOperator.getOperatorName() + "%"), new FilterTransformer()); + + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, null); + GraphWalker ogw = new DefaultGraphWalker(disp); + + List topNodes = new ArrayList(); + topNodes.addAll(pctx.getTopOps().values()); + ogw.startWalking(topNodes, null); + } return pctx; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index aa1e509..61f1374 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -51,41 +51,10 @@ import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; import org.apache.hadoop.hive.ql.parse.SemanticException; -import org.apache.hadoop.hive.ql.plan.AggregationDesc; -import org.apache.hadoop.hive.ql.plan.ColStatistics; -import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.*; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper; -import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; -import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicListDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; -import org.apache.hadoop.hive.ql.plan.GroupByDesc; -import org.apache.hadoop.hive.ql.plan.JoinCondDesc; -import org.apache.hadoop.hive.ql.plan.JoinDesc; -import org.apache.hadoop.hive.ql.plan.MapJoinDesc; -import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.stats.StatsUtils; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualNS; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNot; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotEqual; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct; +import org.apache.hadoop.hive.ql.udf.generic.*; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import 
org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; @@ -494,6 +463,12 @@ private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, AnnotateSt final ExprNodeDesc leftExpression = fd.getChildren().get(2); // left expression final ExprNodeDesc rightExpression = fd.getChildren().get(3); // right expression + // Short circuit and return the current number of rows if this is a + // synthetic predicate with dynamic values + if (leftExpression instanceof ExprNodeDynamicValueDesc) { + return stats.getNumRows(); + } + // We transform the BETWEEN clause to AND clause (with NOT on top in invert is true). // This is more straightforward, as the evaluateExpression method will deal with // generating the final row count relying on the basic comparator evaluation methods @@ -888,18 +863,29 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child, } else if (udf instanceof GenericUDFOPNotEqual) { return numRows; } else if (udf instanceof GenericUDFOPEqualOrGreaterThan - || udf instanceof GenericUDFOPEqualOrLessThan - || udf instanceof GenericUDFOPGreaterThan - || udf instanceof GenericUDFOPLessThan) { + || udf instanceof GenericUDFOPEqualOrLessThan + || udf instanceof GenericUDFOPGreaterThan + || udf instanceof GenericUDFOPLessThan) { return evaluateComparator(stats, genFunc); } else if (udf instanceof GenericUDFOPNotNull) { - return evaluateNotNullExpr(stats, genFunc); + return evaluateNotNullExpr(stats, genFunc); } else if (udf instanceof GenericUDFOPNull) { return evaluateColEqualsNullExpr(stats, genFunc); } else if (udf instanceof GenericUDFOPAnd || udf instanceof GenericUDFOPOr - || udf instanceof GenericUDFIn || udf instanceof GenericUDFBetween - || udf instanceof GenericUDFOPNot) { + || udf instanceof GenericUDFIn || udf instanceof GenericUDFBetween + || udf instanceof GenericUDFOPNot) { return evaluateExpression(stats, genFunc, aspCtx, neededCols, fop, evaluatedRowCount); + } else if (udf instanceof GenericUDFInBloomFilter) { + if (genFunc.getChildren().get(1) instanceof ExprNodeDynamicValueDesc) { + // Synthetic predicates from semijoin opt should not affect stats. 
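+ // (Added note: the filter's selectivity depends on runtime min/max/bloom
+ // values that are unknown at compile time, so no row-count reduction is
+ // assumed.)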
+ return numRows; + } + } + } else if (child instanceof ExprNodeConstantDesc) { + if (Boolean.FALSE.equals(((ExprNodeConstantDesc) child).getValue())) { + return 0; + } else { + return stats.getNumRows(); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java index 3a111aa..6141391 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java @@ -20,13 +20,7 @@ import static org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits.AUTOPARALLEL; -import java.util.ArrayList; -import java.util.Deque; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; +import java.util.*; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; @@ -42,18 +36,13 @@ import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.UnionOperator; import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.lib.*; import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils; -import org.apache.hadoop.hive.ql.plan.BaseWork; -import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.FileSinkDesc; -import org.apache.hadoop.hive.ql.plan.MapWork; -import org.apache.hadoop.hive.ql.plan.ReduceWork; -import org.apache.hadoop.hive.ql.plan.TableDesc; -import org.apache.hadoop.hive.ql.plan.TezEdgeProperty; +import org.apache.hadoop.hive.ql.plan.*; import org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType; -import org.apache.hadoop.hive.ql.plan.TezWork; -import org.apache.hadoop.hive.ql.plan.UnionWork; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFInBloomFilter; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -269,6 +258,15 @@ public static void removeUnionOperators(GenTezProcContext context, BaseWork work ((DynamicPruningEventDesc) event.getConf()).setTableScan((TableScanOperator) newRoot); } } + // This TableScanOperator could be part of semijoin optimization. + Map rsOpToTsOpMap = + context.parseContext.getRsOpToTsOpMap(); + for (ReduceSinkOperator rs : rsOpToTsOpMap.keySet()) { + if (rsOpToTsOpMap.get(rs) == orig) { + rsOpToTsOpMap.put(rs, (TableScanOperator) newRoot); + break; + } + } } context.rootToWorkMap.remove(orig); context.rootToWorkMap.put(newRoot, work); @@ -479,7 +477,7 @@ private static void findRoots(Operator op, List> ops) { * Remove an operator branch. When we see a fork, we know it's time to do the removal. 
* @param event the leaf node of the branch to be removed
*/
- public static void removeBranch(AppMasterEventOperator event) {
+ public static void removeBranch(Operator event) {
Operator child = event;
Operator curr = event;
@@ -511,4 +509,140 @@ public static EdgeType determineEdgeType(BaseWork preceedingWork, BaseWork follo
}
return EdgeType.SIMPLE_EDGE;
}
+
+ public static void processDynamicMinMaxPushDownOperator(
+ GenTezProcContext procCtx, RuntimeValuesInfo runtimeValuesInfo,
+ ReduceSinkOperator rs)
+ throws SemanticException {
+ TableScanOperator ts = procCtx.parseContext.getRsOpToTsOpMap().get(rs);
+
+ List rsWorkList = procCtx.childToWorkMap.get(rs);
+ if (ts == null || rsWorkList == null) {
+ // This happens when the ReduceSink's edge has been removed by cycle
+ // detection logic. Nothing to do here.
+ return;
+ }
+ LOG.debug("ReduceSink " + rs + " to TableScan " + ts);
+
+ if (rsWorkList.size() != 1) {
+ StringBuilder sb = new StringBuilder();
+ for (BaseWork curWork : rsWorkList) {
+ if (sb.length() > 0) {
+ sb.append(", ");
+ }
+ sb.append(curWork.getName());
+ }
+ throw new SemanticException(rs + " belongs to multiple BaseWorks: " + sb.toString());
+ }
+
+ BaseWork parentWork = rsWorkList.get(0);
+ BaseWork childWork = procCtx.rootToWorkMap.get(ts);
+
+ // Connect parent/child work with a broadcast edge.
+ LOG.debug("Connecting BaseWork - " + parentWork.getName() + " to " + childWork.getName());
+ TezEdgeProperty edgeProperty = new TezEdgeProperty(EdgeType.BROADCAST_EDGE);
+ TezWork tezWork = procCtx.currentTask.getWork();
+ tezWork.connect(parentWork, childWork, edgeProperty);
+
+ // Set output names in ReduceSink
+ rs.getConf().setOutputName(childWork.getName());
+
+ // Set up the dynamic values in the childWork.
+ RuntimeValuesInfo childRuntimeValuesInfo =
+ new RuntimeValuesInfo();
+ childRuntimeValuesInfo.setTableDesc(runtimeValuesInfo.getTableDesc());
+ childRuntimeValuesInfo.setDynamicValueIDs(runtimeValuesInfo.getDynamicValueIDs());
+ childRuntimeValuesInfo.setColExprs(runtimeValuesInfo.getColExprs());
+ childWork.setInputSourceToRuntimeValuesInfo(
+ parentWork.getName(), childRuntimeValuesInfo);
+ }
+
+ // Functionality to remove semi-join optimization
+ public static void removeSemiJoinOperator(ParseContext context,
+ ReduceSinkOperator rs,
+ TableScanOperator ts) throws SemanticException {
+ // Cleanup the synthetic predicate in the tablescan operator by
+ // replacing it with "true"
+ LOG.debug("Removing ReduceSink " + rs + " and TableScan " + ts);
+ ExprNodeDesc constNode = new ExprNodeConstantDesc(
+ TypeInfoFactory.booleanTypeInfo, Boolean.TRUE);
+ DynamicValuePredicateContext filterDynamicValuePredicatesCollection =
+ new DynamicValuePredicateContext();
+ collectDynamicValuePredicates(ts.getConf().getFilterExpr(),
+ filterDynamicValuePredicatesCollection);
+ for (ExprNodeDesc nodeToRemove : filterDynamicValuePredicatesCollection
+ .childParentMapping.keySet()) {
+ // Find out if this synthetic predicate belongs to the current cycle
+ boolean skip = true;
+ for (ExprNodeDesc expr : nodeToRemove.getChildren()) {
+ if (expr instanceof ExprNodeDynamicValueDesc) {
+ String dynamicValueIdFromExpr = ((ExprNodeDynamicValueDesc) expr)
+ .getDynamicValue().getId();
+ List dynamicValueIdsFromMap = context.
+ getRsToRuntimeValuesInfoMap().get(rs).getDynamicValueIDs(); + for (String dynamicValueIdFromMap : dynamicValueIdsFromMap) { + if (dynamicValueIdFromExpr.equals(dynamicValueIdFromMap)) { + // Intended predicate to be removed + skip = false; + break; + } + } + } + } + if (!skip) { + ExprNodeDesc nodeParent = filterDynamicValuePredicatesCollection + .childParentMapping.get(nodeToRemove); + if (nodeParent == null) { + // This was the only predicate, set filter expression to null + ts.getConf().setFilterExpr(null); + } else { + int i = nodeParent.getChildren().indexOf(nodeToRemove); + nodeParent.getChildren().remove(i); + nodeParent.getChildren().add(i, constNode); + } + // skip the rest of the predicates + skip = true; + } + } + context.getRsOpToTsOpMap().remove(rs); + } + + private static class DynamicValuePredicateContext implements NodeProcessorCtx { + HashMap childParentMapping = new HashMap(); + } + + private static class DynamicValuePredicateProc implements NodeProcessor { + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + DynamicValuePredicateContext ctx = (DynamicValuePredicateContext) procCtx; + ExprNodeDesc parent = (ExprNodeDesc) stack.get(stack.size() - 2); + if (parent instanceof ExprNodeGenericFuncDesc) { + ExprNodeGenericFuncDesc parentFunc = (ExprNodeGenericFuncDesc) parent; + if (parentFunc.getGenericUDF() instanceof GenericUDFBetween || + parentFunc.getGenericUDF() instanceof GenericUDFInBloomFilter) { + ExprNodeDesc grandParent = stack.size() >= 3 ? + (ExprNodeDesc) stack.get(stack.size() - 3) : null; + ctx.childParentMapping.put(parentFunc, grandParent); + } + } + + return null; + } + } + + private static void collectDynamicValuePredicates(ExprNodeDesc pred, NodeProcessorCtx ctx) throws SemanticException { + // create a walker which walks the tree in a DFS manner while maintaining + // the operator stack. 
The dispatcher + // generates the plan from the operator tree + Map exprRules = new LinkedHashMap(); + exprRules.put(new RuleRegExp("R1", ExprNodeDynamicValueDesc.class.getName() + "%"), new DynamicValuePredicateProc()); + Dispatcher disp = new DefaultRuleDispatcher(null, exprRules, ctx); + GraphWalker egw = new DefaultGraphWalker(disp); + List startNodes = new ArrayList(); + startNodes.add(pred); + + egw.startWalking(startNodes, null); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java index 35f34da..3f9f76c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java @@ -50,6 +50,7 @@ import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext; import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.AnalyzeRewriteContext; +import org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo; import org.apache.hadoop.hive.ql.plan.CreateTableDesc; import org.apache.hadoop.hive.ql.plan.CreateViewDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; @@ -125,6 +126,11 @@ private boolean needViewColumnAuthorization; private Set acidFileSinks = Collections.emptySet(); + // Map to store mapping between reduce sink Operator and TS Operator for semijoin + private Map rsOpToTsOpMap = + new HashMap(); + private Map rsToRuntimeValuesInfo = + new HashMap(); public ParseContext() { } @@ -652,4 +658,19 @@ private static void getAllOps(List builder, Set visited, Ope } } + public void setRsToRuntimeValuesInfoMap(Map rsToRuntimeValuesInfo) { + this.rsToRuntimeValuesInfo = rsToRuntimeValuesInfo; + } + + public Map getRsToRuntimeValuesInfoMap() { + return rsToRuntimeValuesInfo; + } + + public void setRsOpToTsOpMap(Map rsOpToTsOpMap) { + this.rsOpToTsOpMap = rsOpToTsOpMap; + } + + public Map getRsOpToTsOpMap() { + return rsOpToTsOpMap; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/RuntimeValuesInfo.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/RuntimeValuesInfo.java new file mode 100644 index 0000000..e1f78f7 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/RuntimeValuesInfo.java @@ -0,0 +1,44 @@ +package org.apache.hadoop.hive.ql.parse; + +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; + +import java.io.Serializable; +import java.util.List; + +/** + * Holds structures required for runtime values and mappings. 
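+ * (Descriptive addition: the TableDesc describes the value row shipped over
+ * the broadcast edge, dynamicValueIDs names each min/max/bloom-filter value,
+ * and colExprs are the expressions producing them.)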
+ */ +public class RuntimeValuesInfo implements Serializable { + private static final long serialVersionUID = 1L; + + private TableDesc tableDesc; + private List dynamicValueIDs; + private List colExprs; + + // get-set methods + public TableDesc getTableDesc() { + return tableDesc; + } + + public void setTableDesc(TableDesc tableDesc) { + this.tableDesc = tableDesc; + } + + public List getDynamicValueIDs() { + return dynamicValueIDs; + } + + public void setDynamicValueIDs(List dynamicValueIDs) { + this.dynamicValueIDs = dynamicValueIDs; + } + + public List getColExprs() { + return colExprs; + } + + public void setColExprs(List colExprs) { + this.colExprs = colExprs; + } +} + diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java index e8b003e..5f9ccc8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java @@ -530,6 +530,9 @@ public ParseContext getParseContext(ParseContext pCtx, List inputs, runCycleAnalysisForPartitionPruning(procCtx, inputs, outputs); perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Run cycle analysis for partition pruning"); + perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER); + // Remove semijoin optimization if it creates a cycle with mapside joins + removeSemiJoinCyclesDueToMapsideJoins(procCtx); + perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Remove semijoin optimizations if it creates a cycle with mapside join"); + + perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER); + // Remove semijoin optimization if SMB join is created. + removeSemijoinOptimizationFromSMBJoins(procCtx); + perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Remove semijoin optimizations if needed"); + + perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER); + // Remove bloomfilter if no stats generated + removeSemiJoinIfNoStats(procCtx); + perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Remove bloom filter optimizations if needed"); + + // need a new run of the constant folding because we might have created lots + // of "and true and true" conditions. + // Rather than run the full constant folding just need to shortcut AND/OR expressions + // involving constant true/false values. 
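+ // Illustrative example (added, with a hypothetical partition value): a filter
+ // left as "true and true and (ds = '2008-04-08')" after predicate removal
+ // shortcuts back to "(ds = '2008-04-08')" without a full folding pass.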
+ if (procCtx.conf.getBoolVar(ConfVars.HIVEOPTCONSTANTPROPAGATION)) {
+ new ConstantPropagate(ConstantPropagateOption.SHORTCUT).transform(procCtx.parseContext);
+ }
+ }
private void runCycleAnalysisForPartitionPruning(OptimizeTezProcContext procCtx,
@@ -163,7 +153,7 @@ private void runCycleAnalysisForPartitionPruning(OptimizeTezProcContext procCtx,
if (component.size() != 1) {
LOG.info("Found cycle in operator plan...");
cycleFree = false;
- removeEventOperator(component, procCtx);
+ removeCycleOperator(component, procCtx);
break;
}
}
@@ -171,29 +161,72 @@ private void runCycleAnalysisForPartitionPruning(OptimizeTezProcContext procCtx,
}
}
- private void removeEventOperator(Set> component, OptimizeTezProcContext context) {
- AppMasterEventOperator victim = null;
+ private void removeCycleOperator(Set> component, OptimizeTezProcContext context) throws SemanticException {
+ AppMasterEventOperator victimAM = null;
+ TableScanOperator victimTS = null;
+ ReduceSinkOperator victimRS = null;
+
for (Operator o : component) {
+ // Look for AppMasterEventOperator or ReduceSinkOperator
if (o instanceof AppMasterEventOperator) {
- if (victim == null
- || o.getConf().getStatistics().getDataSize() < victim.getConf().getStatistics()
+ if (victimAM == null
+ || o.getStatistics().getDataSize() < victimAM.getStatistics()
.getDataSize()) {
- victim = (AppMasterEventOperator) o;
+ victimAM = (AppMasterEventOperator) o;
}
+ } else if (o instanceof ReduceSinkOperator) {
+ TableScanOperator ts = context.parseContext.getRsOpToTsOpMap().get(o);
+ if (ts == null) {
+ continue;
+ }
+ // Sanity check
+ assert component.contains(ts);
+
+ if (victimRS == null ||
+ ts.getStatistics().getDataSize() <
+ victimTS.getStatistics().getDataSize()) {
+ victimRS = (ReduceSinkOperator) o;
+ victimTS = ts;
+ }
+ }
+ }
+
+ // Always set the min/max optimization as victim.
+ Operator victim = victimRS;
+
+ if (victimRS == null && victimAM != null) {
+ victim = victimAM;
+ } else if (victimAM == null) {
+ // do nothing
+ } else {
+ // Cycle consists of at least one dynamic partition pruning (DPP)
+ // optimization and at least one min/max optimization.
+ // DPP is a better optimization unless it ends up scanning the
+ // bigger table for keys instead of the smaller table.
+
+ // Get the parent TS of victimRS.
+ Operator op = victimRS;
+ while (!(op instanceof TableScanOperator)) {
+ op = op.getParentOperators().get(0);
+ }
+ if ((2 * op.getStatistics().getDataSize()) <
+ victimAM.getStatistics().getDataSize()) {
+ victim = victimAM;
}
}
if (victim == null ||
- (!context.pruningOpsRemovedByPriorOpt.isEmpty() &&
- context.pruningOpsRemovedByPriorOpt.contains(victim))) {
+ (!context.pruningOpsRemovedByPriorOpt.isEmpty() &&
+ context.pruningOpsRemovedByPriorOpt.contains(victim))) {
return;
}
GenTezUtils.removeBranch(victim);
- // at this point we've found the fork in the op pipeline that has the pruning as a child plan.
- LOG.info("Disabling dynamic pruning for: "
- + ((DynamicPruningEventDesc) victim.getConf()).getTableScan().toString()
- + ". 
Needed to break cyclic dependency"); + + if (victim == victimRS) { + GenTezUtils.removeSemiJoinOperator(context.parseContext, victimRS, victimTS); + } + return; } // Tarjan's algo @@ -205,11 +238,11 @@ private void removeEventOperator(Set> component, OptimizeTezProcCont Map, Integer> indexes = new HashMap, Integer>(); Map, Integer> lowLinks = new HashMap, Integer>(); Stack> nodes = new Stack>(); - Set>> components = new HashSet>>(); + Set>> components = new LinkedHashSet>>(); for (Operator o : deque) { if (!indexes.containsKey(o)) { - connect(o, index, nodes, indexes, lowLinks, components); + connect(o, index, nodes, indexes, lowLinks, components, procCtx.parseContext); } } @@ -218,7 +251,7 @@ private void removeEventOperator(Set> component, OptimizeTezProcCont private void connect(Operator o, AtomicInteger index, Stack> nodes, Map, Integer> indexes, Map, Integer> lowLinks, - Set>> components) { + Set>> components, ParseContext parseContext) { indexes.put(o, index.get()); lowLinks.put(o, index.get()); @@ -232,13 +265,22 @@ private void connect(Operator o, AtomicInteger index, Stack> node TableScanOperator ts = ((DynamicPruningEventDesc) o.getConf()).getTableScan(); LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString()); children.add(ts); + } else if (o instanceof ReduceSinkOperator){ + // min/max case + children = new ArrayList>(); + children.addAll(o.getChildOperators()); + TableScanOperator ts = parseContext.getRsOpToTsOpMap().get(o); + if (ts != null) { + LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString()); + children.add(ts); + } } else { children = o.getChildOperators(); } for (Operator child : children) { if (!indexes.containsKey(child)) { - connect(child, index, nodes, indexes, lowLinks, components); + connect(child, index, nodes, indexes, lowLinks, components, parseContext); lowLinks.put(o, Math.min(lowLinks.get(o), lowLinks.get(child))); } else if (nodes.contains(child)) { lowLinks.put(o, Math.min(lowLinks.get(o), indexes.get(child))); @@ -246,7 +288,7 @@ private void connect(Operator o, AtomicInteger index, Stack> node } if (lowLinks.get(o).equals(indexes.get(o))) { - Set> component = new HashSet>(); + Set> component = new LinkedHashSet>(); components.add(component); Operator current; do { @@ -315,14 +357,6 @@ private void runDynamicPartitionPruning(OptimizeTezProcContext procCtx, Set> rootTasks, Pa GenTezUtils.processFileSink(procCtx, fileSink); } + // Connect any edges required for min/max pushdown + if (pCtx.getRsToRuntimeValuesInfoMap().size() > 0) { + for (ReduceSinkOperator rs : pCtx.getRsToRuntimeValuesInfoMap().keySet()) { + // Process min/max + GenTezUtils.processDynamicMinMaxPushDownOperator( + procCtx, pCtx.getRsToRuntimeValuesInfoMap().get(rs), rs); + } + } // and finally we hook up any events that need to be sent to the tez AM LOG.debug("There are " + procCtx.eventOperatorSet.size() + " app master events."); for (AppMasterEventOperator event : procCtx.eventOperatorSet) { @@ -528,4 +570,256 @@ protected void optimizeTaskPlan(List> rootTasks, Pa perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "optimizeTaskPlan"); return; } + + private static class SemijoinRemovalContext implements NodeProcessorCtx { + List> parents = new ArrayList>(); + } + + private static class SemijoinRemovalProc implements NodeProcessor { + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... 
nodeOutputs) throws SemanticException { + SemijoinRemovalContext ctx = (SemijoinRemovalContext) procCtx; + Operator parent = (Operator) stack.get(stack.size() - 2); + ctx.parents.add(parent); + return null; + } + } + + private static void collectSemijoinOps(Operator ts, NodeProcessorCtx ctx) throws SemanticException { + // create a walker which walks the tree in a DFS manner while maintaining + // the operator stack. The dispatcher + // generates the plan from the operator tree + Map opRules = new LinkedHashMap(); + opRules.put(new RuleRegExp("R1", SelectOperator.getOperatorName() + "%" + + TezDummyStoreOperator.getOperatorName() + "%"), + new SemijoinRemovalProc()); + opRules.put(new RuleRegExp("R2", SelectOperator.getOperatorName() + "%" + + CommonMergeJoinOperator.getOperatorName() + "%"), + new SemijoinRemovalProc()); + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, ctx); + GraphWalker ogw = new PreOrderOnceWalker(disp); + List startNodes = new ArrayList(); + startNodes.add(ts); + + HashMap outputMap = new HashMap(); + ogw.startWalking(startNodes, null); + } + + private static class SMBJoinOpProc implements NodeProcessor { + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + List tsOps = new ArrayList(); + // Get one top level TS Op directly from the stack + tsOps.add((TableScanOperator)stack.get(0)); + + // Get the other one by examining Join Op + List> parents = ((CommonMergeJoinOperator) nd).getParentOperators(); + for (Operator parent : parents) { + if (parent instanceof TezDummyStoreOperator) { + // already accounted for + continue; + } + + assert parent instanceof SelectOperator; + while(parent != null) { + if (parent instanceof TableScanOperator) { + tsOps.add((TableScanOperator) parent); + break; + } + parent = parent.getParentOperators().get(0); + } + } + + // Now the relevant TableScanOperators are known, find if there exists + // a semijoin filter on any of them, if so, remove it. + ParseContext pctx = ((OptimizeTezProcContext) procCtx).parseContext; + for (TableScanOperator ts : tsOps) { + for (ReduceSinkOperator rs : pctx.getRsOpToTsOpMap().keySet()) { + if (ts == pctx.getRsOpToTsOpMap().get(rs)) { + // match! + GenTezUtils.removeBranch(rs); + GenTezUtils.removeSemiJoinOperator(pctx, rs, ts); + } + } + } + return null; + } + } + + private static void removeSemijoinOptimizationFromSMBJoins( + OptimizeTezProcContext procCtx) throws SemanticException { + if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) || + procCtx.parseContext.getRsOpToTsOpMap().size() == 0) { + return; + } + + Map opRules = new LinkedHashMap(); + opRules.put( + new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%" + + ".*" + TezDummyStoreOperator.getOperatorName() + "%" + + CommonMergeJoinOperator.getOperatorName() + "%"), + new SMBJoinOpProc()); + + // The dispatcher finds SMB and if there is semijoin optimization before it, removes it. + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx); + List topNodes = new ArrayList(); + topNodes.addAll(procCtx.parseContext.getTopOps().values()); + GraphWalker ogw = new PreOrderOnceWalker(disp); + ogw.startWalking(topNodes, null); + } + + private static class SemiJoinCycleRemovalDueToMapsideJoins implements NodeProcessor { + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... 
nodeOutputs) throws SemanticException { + ParseContext pCtx = ((OptimizeTezProcContext) procCtx).parseContext; + Operator childJoin = ((Operator) nd); + Operator parentJoin = ((Operator) stack.get(stack.size() - 2)); + + if (parentJoin.getChildOperators().size() == 1) { + // Nothing to do here + return null; + } + + for (Operator child : parentJoin.getChildOperators()) { + if (!(child instanceof SelectOperator)) { + continue; + } + + while(child.getChildOperators().size() > 0) { + child = child.getChildOperators().get(0); + } + + if (!(child instanceof ReduceSinkOperator)) { + continue; + } + + ReduceSinkOperator rs = ((ReduceSinkOperator) child); + TableScanOperator ts = pCtx.getRsOpToTsOpMap().get(rs); + if (ts == null) { + continue; + } + // This is a semijoin branch. Find if this is creating a potential + // cycle with childJoin. + for (Operator parent : childJoin.getParentOperators()) { + if (parent == parentJoin) { + continue; + } + + assert parent instanceof ReduceSinkOperator; + while (parent.getParentOperators().size() > 0) { + parent = parent.getParentOperators().get(0); + } + + if (parent == ts) { + // We have a cycle! + GenTezUtils.removeBranch(rs); + GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts); + } + } + } + return null; + } + } + + private static void removeSemiJoinCyclesDueToMapsideJoins( + OptimizeTezProcContext procCtx) throws SemanticException { + if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) || + procCtx.parseContext.getRsOpToTsOpMap().size() == 0) { + return; + } + + Map opRules = new LinkedHashMap(); + opRules.put( + new RuleRegExp("R1", MapJoinOperator.getOperatorName() + "%" + + MapJoinOperator.getOperatorName() + "%"), + new SemiJoinCycleRemovalDueToMapsideJoins()); + opRules.put( + new RuleRegExp("R2", MapJoinOperator.getOperatorName() + "%" + + CommonMergeJoinOperator.getOperatorName() + "%"), + new SemiJoinCycleRemovalDueToMapsideJoins()); + opRules.put( + new RuleRegExp("R3", CommonMergeJoinOperator.getOperatorName() + "%" + + MapJoinOperator.getOperatorName() + "%"), + new SemiJoinCycleRemovalDueToMapsideJoins()); + opRules.put( + new RuleRegExp("R4", CommonMergeJoinOperator.getOperatorName() + "%" + + CommonMergeJoinOperator.getOperatorName() + "%"), + new SemiJoinCycleRemovalDueToMapsideJoins()); + + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx); + List topNodes = new ArrayList(); + topNodes.addAll(procCtx.parseContext.getTopOps().values()); + GraphWalker ogw = new PreOrderOnceWalker(disp); + ogw.startWalking(topNodes, null); + } + + private static class SemiJoinRemovalIfNoStatsProc implements NodeProcessor { + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + assert nd instanceof ReduceSinkOperator; + ReduceSinkOperator rs = (ReduceSinkOperator) nd; + ParseContext pCtx = ((OptimizeTezProcContext) procCtx).parseContext; + TableScanOperator ts = pCtx.getRsOpToTsOpMap().get(rs); + if (ts == null) { + // nothing to do here. + return null; + } + + // This is a semijoin branch. 
The stack should look like:
+ // -SEL-GB1-RS1-GB2-RS2
+ GroupByOperator gbOp = (GroupByOperator) (stack.get(stack.size() - 2));
+ GroupByDesc gbDesc = gbOp.getConf();
+ ArrayList aggregationDescs = gbDesc.getAggregators();
+ boolean removeSemiJoin = false;
+ for (AggregationDesc agg : aggregationDescs) {
+ if (!"bloom_filter".equals(agg.getGenericUDAFName())) {
+ continue;
+ }
+
+ GenericUDAFBloomFilterEvaluator udafBloomFilterEvaluator =
+ (GenericUDAFBloomFilterEvaluator) agg.getGenericUDAFEvaluator();
+ long expectedEntries = udafBloomFilterEvaluator.getExpectedEntries();
+ if (expectedEntries == -1 || expectedEntries >
+ pCtx.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES)) {
+ removeSemiJoin = true;
+ break;
+ }
+ }
+ if (removeSemiJoin) {
+ // The stats are not annotated, remove the semijoin operator
+ GenTezUtils.removeBranch(rs);
+ GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
+ }
+ return null;
+ }
+ }
+
+ private void removeSemiJoinIfNoStats(OptimizeTezProcContext procCtx)
+ throws SemanticException {
+ if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION)) {
+ // Not needed without semi-join reduction
+ return;
+ }
+
+ Map opRules = new LinkedHashMap();
+ opRules.put(
+ new RuleRegExp("R1", GroupByOperator.getOperatorName() + "%" +
+ ReduceSinkOperator.getOperatorName() + "%" +
+ GroupByOperator.getOperatorName() + "%" +
+ ReduceSinkOperator.getOperatorName() + "%"),
+ new SemiJoinRemovalIfNoStatsProc());
+ Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
+ List topNodes = new ArrayList();
+ topNodes.addAll(procCtx.parseContext.getTopOps().values());
+ GraphWalker ogw = new PreOrderOnceWalker(disp);
+ ogw.startWalking(topNodes, null);
+ }
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/AggregationDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/AggregationDesc.java
index 1ecbaad..f0b062e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/AggregationDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/AggregationDesc.java
@@ -152,6 +152,13 @@ public String getExprString() {
}
sb.append(exp.getExprString());
}
+
+ String evaluatorExpr = getGenericUDAFEvaluator().getExprString();
+ if (evaluatorExpr != null && !evaluatorExpr.isEmpty()) {
+ sb.append(", ");
+ sb.append(evaluatorExpr);
+ }
+
sb.append(")");
return sb.toString();
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java
index 13a0811..8c341fc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.plan;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.LinkedList;
import java.util.LinkedHashSet;
import java.util.List;
@@ -32,6 +33,7 @@ import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
+import org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
@@ -76,6 +78,10 @@ public BaseWork(String name) {
private int reservedMemoryMB = -1; // default to -1 means we leave it up to Tez to decide
+ // Used for value registry
+ private Map inputSourceToRuntimeValuesInfo =
+ new HashMap();
+
public void setGatheringStats(boolean gatherStats) {
this.gatheringStats =
gatherStats; } @@ -251,4 +257,13 @@ public void addSortCols(List sortCols) { public List getSortCols() { return sortColNames; } + + public Map getInputSourceToRuntimeValuesInfo() { + return inputSourceToRuntimeValuesInfo; + } + + public void setInputSourceToRuntimeValuesInfo( + String workName, RuntimeValuesInfo runtimeValuesInfo) { + inputSourceToRuntimeValuesInfo.put(workName, runtimeValuesInfo); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/DynamicValue.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/DynamicValue.java new file mode 100644 index 0000000..874c62b --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/DynamicValue.java @@ -0,0 +1,137 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.DynamicValueRegistry; +import org.apache.hadoop.hive.ql.exec.ObjectCache; +import org.apache.hadoop.hive.ql.exec.ObjectCacheFactory; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.sarg.LiteralDelegate; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; + +import java.io.Serializable; + + +public class DynamicValue implements LiteralDelegate, Serializable { + + private static final long serialVersionUID = 1L; + + public static final String DYNAMIC_VALUE_REGISTRY_CACHE_KEY = "DynamicValueRegistry"; + + protected transient Configuration conf; + + protected String id; + TypeInfo typeInfo; + PrimitiveObjectInspector objectInspector; + + transient protected Object val; + transient boolean initialized = false; + + public DynamicValue(String id, TypeInfo typeInfo) { + this.id = id; + this.typeInfo = typeInfo; + this.objectInspector = (PrimitiveObjectInspector) TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(typeInfo); + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + } + + @Override + public Configuration getConf() { + return conf; + } + + public TypeInfo getTypeInfo() { + return typeInfo; + } + + public void setTypeInfo(TypeInfo typeInfo) { + this.typeInfo = typeInfo; + } + + public PrimitiveObjectInspector getObjectInspector() { + return objectInspector; + } + + public void setObjectInspector(PrimitiveObjectInspector objectInspector) { + this.objectInspector = objectInspector; + } + + @Override + public String getId() { return id;} + + public void setId(String id) { + this.id = id; + } + + 
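// Added usage sketch (hypothetical id and type; not part of the patch):
+ //   DynamicValue dv = new DynamicValue("RS_4_srcpart_ds_min", TypeInfoFactory.stringTypeInfo);
+ //   dv.setConf(jobConf);        // must be set before the first access
+ //   Object min = dv.getValue(); // lazily resolved via DynamicValueRegistry
+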
@Override + public Object getLiteral() { + return getJavaValue(); + } + + public Object getJavaValue() { + return objectInspector.getPrimitiveJavaObject(getValue()); + } + + public Object getWritableValue() { + return objectInspector.getPrimitiveWritableObject(getValue()); + } + + public Object getValue() { + if (initialized) { + return val; + } + + if (conf == null) { + throw new IllegalStateException("Cannot retrieve dynamic value " + id + " - no conf set"); + } + + try { + // Get object cache + String queryId = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYID); + ObjectCache cache = ObjectCacheFactory.getCache(conf, queryId, false); + + // Get the registry + DynamicValueRegistry valueRegistry = cache.retrieve(DYNAMIC_VALUE_REGISTRY_CACHE_KEY); + if (valueRegistry == null) { + throw new IllegalStateException("DynamicValueRegistry not available"); + } + val = valueRegistry.getValue(id); + initialized = true; + } catch (Exception err) { + throw new IllegalStateException("Failed to retrieve dynamic value for " + id, err); + } + + return val; + } + + @Override + public String toString() { + // If the id is a generated unique ID then this could affect .q file golden files for tests that run EXPLAIN queries. + return "DynamicValue(" + id + ")"; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicValueDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicValueDesc.java new file mode 100644 index 0000000..c9e7b67 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicValueDesc.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import java.io.Serializable; + + +/** + * This expression represents a value that will be available at runtime. + * + */ +public class ExprNodeDynamicValueDesc extends ExprNodeDesc implements Serializable { + + private static final long serialVersionUID = 1L; + + protected DynamicValue dynamicValue; + + public ExprNodeDynamicValueDesc() { + } + + public ExprNodeDynamicValueDesc(DynamicValue value) { + super(value.getTypeInfo()); + this.dynamicValue = value; + } + + @Override + public ExprNodeDesc clone() { + return new ExprNodeDynamicValueDesc(dynamicValue); + } + + @Override + public boolean isSame(Object o) { + if (o instanceof ExprNodeDynamicValueDesc) { + Object otherValue = ((ExprNodeDynamicValueDesc) o).getDynamicValue(); + if (dynamicValue == null) { + return otherValue == null; + } + return dynamicValue.equals(otherValue); + } + return false; + } + + public DynamicValue getDynamicValue() { + return dynamicValue; + } + + public void setValue(DynamicValue value) { + this.dynamicValue = value; + } + + @Override + public String getExprString() { + return dynamicValue != null ? 
dynamicValue.toString() : "null dynamic literal";
+ }
+
+ @Override
+ public String toString() {
+ return dynamicValue != null ? dynamicValue.toString() : "null dynamic literal";
+ }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
new file mode 100644
index 0000000..fb9a140
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
@@ -0,0 +1,267 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.io.DateWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.*;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hive.common.util.BloomFilter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.sql.Timestamp;
+
+/**
+ * Generic UDAF to generate a Bloom Filter
+ */
+public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
+
+ private static final Logger LOG = LoggerFactory.getLogger(GenericUDAFBloomFilter.class);
+
+ @Override
+ public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException {
+ return new GenericUDAFBloomFilterEvaluator();
+ }
+
+ @Override
+ public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+ return new GenericUDAFBloomFilterEvaluator();
+ }
+
+ /**
+ * GenericUDAFBloomFilterEvaluator - Evaluator class for BloomFilter
+ */
+ public static class GenericUDAFBloomFilterEvaluator extends GenericUDAFEvaluator {
+ // Source operator to get the number of entries
+ private Operator sourceOperator;
+ private long maxEntries = 0;
+
+ // ObjectInspector for input data.
+ private PrimitiveObjectInspector inputOI; + + // Bloom filter result + private ByteArrayOutputStream result = new ByteArrayOutputStream(); + + @Override + public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException { + super.init(m, parameters); + + // Initialize input + if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) { + inputOI = (PrimitiveObjectInspector) parameters[0]; + } else { + // Do nothing for other modes + } + + // Output will be the same in both partial and full aggregation modes: + // a serialized BloomFilter in a BytesWritable + return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; + } + + /** + * Class for storing the BloomFilter + */ + @AggregationType(estimable = true) + static class BloomFilterBuf extends AbstractAggregationBuffer { + BloomFilter bloomFilter; + + public BloomFilterBuf(long expectedEntries, long maxEntries) { + if (expectedEntries > maxEntries) { + bloomFilter = new BloomFilter(1); + } else { + bloomFilter = new BloomFilter(expectedEntries); + } + } + + @Override + public int estimate() { + return (int) bloomFilter.sizeInBytes(); + } + } + + @Override + public void reset(AggregationBuffer agg) throws HiveException { + ((BloomFilterBuf)agg).bloomFilter.reset(); + } + + @Override + public AggregationBuffer getNewAggregationBuffer() throws HiveException { + long expectedEntries = getExpectedEntries(); + if (expectedEntries < 0) { + throw new IllegalStateException("BloomFilter expectedEntries not initialized"); + } + + BloomFilterBuf buf = new BloomFilterBuf(expectedEntries, maxEntries); + reset(buf); + return buf; + } + + @Override + public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException { + if (parameters == null || parameters[0] == null) { + // 2nd condition occurs when the input has 0 rows (possible due to + // filtering, joins, etc.). + return; + } + + BloomFilter bf = ((BloomFilterBuf)agg).bloomFilter; + + // Add the expression into the BloomFilter + switch (inputOI.getPrimitiveCategory()) { + case BOOLEAN: + boolean vBoolean = ((BooleanObjectInspector)inputOI).get(parameters[0]); + bf.addLong(vBoolean ? 1 : 0); + break; + case BYTE: + byte vByte = ((ByteObjectInspector)inputOI).get(parameters[0]); + bf.addLong(vByte); + break; + case SHORT: + short vShort = ((ShortObjectInspector)inputOI).get(parameters[0]); + bf.addLong(vShort); + break; + case INT: + int vInt = ((IntObjectInspector)inputOI).get(parameters[0]); + bf.addLong(vInt); + break; + case LONG: + long vLong = ((LongObjectInspector)inputOI).get(parameters[0]); + bf.addLong(vLong); + break; + case FLOAT: + float vFloat = ((FloatObjectInspector)inputOI).get(parameters[0]); + bf.addDouble(vFloat); + break; + case DOUBLE: + double vDouble = ((DoubleObjectInspector)inputOI).get(parameters[0]); + bf.addDouble(vDouble); + break; + case DECIMAL: + HiveDecimal vDecimal = ((HiveDecimalObjectInspector)inputOI). + getPrimitiveJavaObject(parameters[0]); + bf.addString(vDecimal.toString()); + break; + case DATE: + DateWritable vDate = ((DateObjectInspector)inputOI). + getPrimitiveWritableObject(parameters[0]); + bf.addLong(vDate.getDays()); + break; + case TIMESTAMP: + Timestamp vTimeStamp = ((TimestampObjectInspector)inputOI). + getPrimitiveJavaObject(parameters[0]); + bf.addLong(vTimeStamp.getTime()); + break; + case CHAR: + Text vChar = ((HiveCharObjectInspector)inputOI).
+ getPrimitiveWritableObject(parameters[0]).getStrippedValue(); + bf.addBytes(vChar.getBytes(), 0, vChar.getLength()); + break; + case VARCHAR: + Text vVarChar = ((HiveVarcharObjectInspector)inputOI). + getPrimitiveWritableObject(parameters[0]).getTextValue(); + bf.addBytes(vVarChar.getBytes(), 0, vVarChar.getLength()); + break; + case STRING: + Text vString = ((StringObjectInspector)inputOI). + getPrimitiveWritableObject(parameters[0]); + bf.addBytes(vString.getBytes(), 0, vString.getLength()); + break; + case BINARY: + BytesWritable vBytes = ((BinaryObjectInspector)inputOI). + getPrimitiveWritableObject(parameters[0]); + bf.addBytes(vBytes.getBytes(), 0, vBytes.getLength()); + break; + default: + throw new UDFArgumentTypeException(0, + "Bad primitive category " + inputOI.getPrimitiveCategory()); + } + } + + @Override + public void merge(AggregationBuffer agg, Object partial) throws HiveException { + if (partial == null) { + return; + } + + BytesWritable bytes = (BytesWritable) partial; + ByteArrayInputStream in = new ByteArrayInputStream(bytes.getBytes()); + // Deserialize the bloom filter + try { + BloomFilter bf = BloomFilter.deserialize(in); + ((BloomFilterBuf)agg).bloomFilter.merge(bf); + } catch (IOException e) { + throw new HiveException(e); + } + } + + @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + result.reset(); + try { + BloomFilter.serialize(result, ((BloomFilterBuf)agg).bloomFilter); + } catch (IOException e) { + throw new HiveException(e); + } + return new BytesWritable(result.toByteArray()); + } + + @Override + public Object terminatePartial(AggregationBuffer agg) throws HiveException { + return terminate(agg); + } + + public long getExpectedEntries() { + if (sourceOperator != null && sourceOperator.getStatistics() != null) { + return sourceOperator.getStatistics().getNumRows(); + } + return -1; + } + + public Operator getSourceOperator() { + return sourceOperator; + } + + public void setSourceOperator(Operator sourceOperator) { + this.sourceOperator = sourceOperator; + } + + public void setMaxEntries(long maxEntries) { + this.maxEntries = maxEntries; + } + + @Override + public String getExprString() { + return "expectedEntries=" + getExpectedEntries(); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java index 18d5285..3a98276 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java @@ -262,6 +262,13 @@ public GenericUDAFEvaluator getWindowingEvaluator(WindowFrameDef wFrmDef) { return null; } + /** + * Optional information to add to the expression string. Subclasses can override. + */ + public String getExprString() { + return ""; + } + protected BasePartitionEvaluator partitionEvaluator; /** diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFInBloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFInBloomFilter.java new file mode 100644 index 0000000..1b7de6c --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFInBloomFilter.java @@ -0,0 +1,168 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.*; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; +import org.apache.hive.common.util.BloomFilter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.sql.Timestamp; + +/** + * GenericUDF to lookup a value in BloomFilter + */ +public class GenericUDFInBloomFilter extends GenericUDF { + private static final Logger LOG = LoggerFactory.getLogger(GenericUDFInBloomFilter.class); + + private transient ObjectInspector valObjectInspector; + private transient ObjectInspector bloomFilterObjectInspector; + private transient BloomFilter bloomFilter; + private transient boolean initializedBloomFilter; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 2) { + throw new UDFArgumentLengthException( + "InBloomFilter requires exactly 2 arguments but got " + arguments.length); + } + + // Verify individual arguments + if (arguments[0].getCategory() != ObjectInspector.Category.PRIMITIVE) { + throw new UDFArgumentTypeException(0, "The 1st argument must be a primitive type but " + + arguments[0].getTypeName() + " was passed"); + } + + if (((PrimitiveObjectInspector) arguments[1]).getPrimitiveCategory() != + PrimitiveObjectInspector.PrimitiveCategory.BINARY) { + throw new UDFArgumentTypeException(1, "The 2nd argument must be a binary type but " + + arguments[1].getTypeName() + " was passed"); + } + + valObjectInspector = arguments[0]; + bloomFilterObjectInspector = arguments[1]; + assert bloomFilterObjectInspector instanceof WritableBinaryObjectInspector; + + initializedBloomFilter = false; + return PrimitiveObjectInspectorFactory.javaBooleanObjectInspector; + } + + @Override + public String getDisplayString(String[] children) { + return getStandardDisplayString("in_bloom_filter", children); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + // Return if either of the arguments is null + if (arguments[0].get() == null || arguments[1].get() == null) { + return null; + } + + if (!initializedBloomFilter) { + // Setup the bloom filter once + try { + BytesWritable bw = (BytesWritable) arguments[1].get(); + 
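+ // [Editor's note] The backing array of a BytesWritable can be longer than
+ // getLength(), so the code below copies exactly getLength() bytes before
+ // handing the stream to BloomFilter.deserialize(); initializedBloomFilter
+ // then caches the result so deserialization happens once per operator
+ // instance rather than once per row.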
byte[] bytes = new byte[bw.getLength()]; + System.arraycopy(bw.getBytes(), 0, bytes, 0, bw.getLength()); + bloomFilter = BloomFilter.deserialize(new ByteArrayInputStream(bytes)); + } catch (IOException e) { + throw new HiveException(e); + } + initializedBloomFilter = true; + } + + // Check if the value is in bloom filter + switch (((PrimitiveObjectInspector)valObjectInspector). + getTypeInfo().getPrimitiveCategory()) { + case BOOLEAN: + boolean vBoolean = ((BooleanObjectInspector)valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testLong(vBoolean ? 1 : 0); + case BYTE: + byte vByte = ((ByteObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testLong(vByte); + case SHORT: + short vShort = ((ShortObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testLong(vShort); + case INT: + int vInt = ((IntObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testLong(vInt); + case LONG: + long vLong = ((LongObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testLong(vLong); + case FLOAT: + float vFloat = ((FloatObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testDouble(vFloat); + case DOUBLE: + double vDouble = ((DoubleObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testDouble(vDouble); + case DECIMAL: + HiveDecimal vDecimal = ((HiveDecimalObjectInspector) valObjectInspector). + getPrimitiveJavaObject(arguments[0].get()); + return bloomFilter.testString(vDecimal.toString()); + case DATE: + DateWritable vDate = ((DateObjectInspector) valObjectInspector). + getPrimitiveWritableObject(arguments[0].get()); + return bloomFilter.testLong(vDate.getDays()); + case TIMESTAMP: + Timestamp vTimeStamp = ((TimestampObjectInspector) valObjectInspector). + getPrimitiveJavaObject(arguments[0].get()); + return bloomFilter.testLong(vTimeStamp.getTime()); + case CHAR: + Text vChar = ((HiveCharObjectInspector) valObjectInspector). + getPrimitiveWritableObject(arguments[0].get()).getStrippedValue(); + return bloomFilter.testBytes(vChar.getBytes(), 0, vChar.getLength()); + case VARCHAR: + Text vVarchar = ((HiveVarcharObjectInspector) valObjectInspector). + getPrimitiveWritableObject(arguments[0].get()).getTextValue(); + return bloomFilter.testBytes(vVarchar.getBytes(), 0, vVarchar.getLength()); + case STRING: + Text vString = ((StringObjectInspector) valObjectInspector). + getPrimitiveWritableObject(arguments[0].get()); + return bloomFilter.testBytes(vString.getBytes(), 0, vString.getLength()); + case BINARY: + BytesWritable vBytes = ((BinaryObjectInspector) valObjectInspector).
+ getPrimitiveWritableObject(arguments[0].get()); + return bloomFilter.testBytes(vBytes.getBytes(), 0, vBytes.getLength()); + default: + throw new UDFArgumentTypeException(0, "Bad primitive category " + + ((PrimitiveObjectInspector) valObjectInspector).getTypeInfo().getPrimitiveCategory()); + } + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestConvertAstToSearchArg.java b/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestConvertAstToSearchArg.java index 93b50a6..6563290 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestConvertAstToSearchArg.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestConvertAstToSearchArg.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.Set; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.exec.SerializationUtilities; import org.apache.hadoop.hive.ql.io.parquet.read.ParquetFilterPredicateConverter; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue; @@ -44,6 +45,8 @@ */ public class TestConvertAstToSearchArg { + private final Configuration conf = new Configuration(); + private static void assertNoSharedNodes(ExpressionTree tree, Set seen ) throws Exception { @@ -547,7 +550,7 @@ public void testExpression1() throws Exception { " \n"; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(9, leaves.size()); @@ -836,7 +839,7 @@ public void testExpression2() throws Exception { " \n"; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(4, leaves.size()); @@ -1269,7 +1272,7 @@ public void testExpression3() throws Exception { " \n"; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(3, leaves.size()); @@ -1493,7 +1496,7 @@ id in (34,50) */ "\n"; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(3, leaves.size()); @@ -1763,7 +1766,7 @@ public void testExpression5() throws Exception { " \n"; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(1, leaves.size()); @@ -2246,7 +2249,7 @@ public void testExpression7() throws Exception { ""; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(9, leaves.size()); @@ -2405,7 +2408,7 @@ public void testExpression8() throws Exception { " "; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(0, leaves.size()); @@ -2538,7 +2541,7 @@ public void testExpression9() throws Exception { " "; SearchArgumentImpl sarg = - (SearchArgumentImpl)
ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(0, leaves.size()); @@ -2663,7 +2666,7 @@ public void testExpression10() throws Exception { ""; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(1, leaves.size()); @@ -2712,7 +2715,7 @@ public void TestTimestampSarg() throws Exception { "AAABgj0BRVFVQcwBBW9yZy5hcGFjaGUuaGFkb29wLmlvLkJvb2xlYW5Xcml0YWJs5Q" + "EAAAECAQFib29sZWHu"; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2731,7 +2734,7 @@ public void TestDateSarg() throws Exception { "Y2hlLmhhZG9vcC5oaXZlLnFsLnVkZi5nZW5lcmljLkdlbmVyaWNVREZPUEVxdWHsAQAAAYI9AUVRVUH" + "MAQVvcmcuYXBhY2hlLmhhZG9vcC5pby5Cb29sZWFuV3JpdGFibOUBAAABAgEBYm9vbGVh7g=="; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2751,7 +2754,7 @@ public void TestDecimalSarg() throws Exception { "oaXZlLnFsLnVkZi5nZW5lcmljLkdlbmVyaWNVREZPUEVxdWHsAQAAAYI9AUVRVUHMAQZvcmcuYXBhY2" + "hlLmhhZG9vcC5pby5Cb29sZWFuV3JpdGFibOUBAAABBAEBYm9vbGVh7g=="; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2771,7 +2774,7 @@ public void TestCharSarg() throws Exception { "vb3AuaGl2ZS5xbC51ZGYuZ2VuZXJpYy5HZW5lcmljVURGT1BFcXVh7AEAAAGCPQFFUVVBzAEGb3JnLm" + "FwYWNoZS5oYWRvb3AuaW8uQm9vbGVhbldyaXRhYmzlAQAAAQQBAWJvb2xlYe4="; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2791,7 +2794,7 @@ public void TestVarcharSarg() throws Exception { "lLmhhZG9vcC5oaXZlLnFsLnVkZi5nZW5lcmljLkdlbmVyaWNVREZPUEVxdWHsAQAAAYI9AUVRVUHMAQ" + "ZvcmcuYXBhY2hlLmhhZG9vcC5pby5Cb29sZWFuV3JpdGFibOUBAAABBAEBYm9vbGVh7g=="; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2810,7 +2813,7 @@ public void TestBigintSarg() throws Exception { "dmUucWwudWRmLmdlbmVyaWMuR2VuZXJpY1VERk9QRXF1YewBAAABgj0BRVFVQcwBBW9yZy5hcGFjaGU" + "uaGFkb29wLmlvLkJvb2xlYW5Xcml0YWJs5QEAAAECAQFib29sZWHu"; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, 
SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2831,7 +2834,7 @@ public void TestBooleanSarg() throws Exception { "hlLmhhZG9vcC5pby5Cb29sZWFuV3JpdGFibOUBAAABAwkBAgEBYrIAAAgBAwkBB29yZy5hcGFjaGUua" + "GFkb29wLmhpdmUucWwudWRmLmdlbmVyaWMuR2VuZXJpY1VERk9QQW7kAQEGAQAAAQMJ"; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("(and leaf-0 leaf-1)", sarg.getExpression().toString()); assertEquals(2, sarg.getLeaves().size()); @@ -2853,7 +2856,7 @@ public void TestFloatSarg() throws Exception { "aXZlLnFsLnVkZi5nZW5lcmljLkdlbmVyaWNVREZPUEVxdWHsAQAAAYI9AUVRVUHMAQVvcmcuYXBhY2h" + "lLmhhZG9vcC5pby5Cb29sZWFuV3JpdGFibOUBAAABAgEBYm9vbGVh7g=="; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2872,7 +2875,7 @@ public void TestDoubleSarg() throws Exception { "b29wLmhpdmUucWwudWRmLmdlbmVyaWMuR2VuZXJpY1VERk9QRXF1YewBAAABgj0BRVFVQcwBBW9yZy5" + "hcGFjaGUuaGFkb29wLmlvLkJvb2xlYW5Xcml0YWJs5QEAAAECAQFib29sZWHu"; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java b/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java index 8cbc26d..df42058 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java @@ -79,7 +79,7 @@ public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator, Object literal, List literalList) { return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName, - literal, literalList); + literal, literalList, null); } @Test diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java index 3295372..59cb31e 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/physical/TestVectorizer.java @@ -77,20 +77,24 @@ public String getDisplayString(String[] children) { @Test public void testAggregateOnUDF() throws HiveException { - AggregationDesc aggDesc = new AggregationDesc(); - aggDesc.setGenericUDAFName("sum"); - ExprNodeGenericFuncDesc exprNodeDesc = new ExprNodeGenericFuncDesc(); - exprNodeDesc.setTypeInfo(TypeInfoFactory.intTypeInfo); - ArrayList params = new ArrayList(); - params.add(exprNodeDesc); - aggDesc.setParameters(params); - GenericUDFAbs absUdf = new GenericUDFAbs(); - exprNodeDesc.setGenericUDF(absUdf); - List children = new ArrayList(); ExprNodeColumnDesc colExprA = new ExprNodeColumnDesc(Integer.class, "col1", "T", false); ExprNodeColumnDesc colExprB = new ExprNodeColumnDesc(Integer.class, "col2", "T", false); 
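+ // [Editor's note] In the rewritten test below, the ExprNodeGenericFuncDesc
+ // is built through its constructor and a real "sum" evaluator is obtained
+ // from FunctionRegistry, so the AggregationDesc carries a concrete
+ // GenericUDAFEvaluator - presumably needed now that explain output can
+ // consult the evaluator's getExprString() hook added in this patch.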
+ + List children = new ArrayList(); children.add(colExprA); - exprNodeDesc.setChildren(children); + ExprNodeGenericFuncDesc exprNodeDesc = new ExprNodeGenericFuncDesc(TypeInfoFactory.intTypeInfo, new GenericUDFAbs(), children); + + ArrayList params = new ArrayList(); + params.add(exprNodeDesc); + + List paramOIs = new ArrayList(); + paramOIs.add(exprNodeDesc.getWritableObjectInspector()); + + AggregationDesc aggDesc = new AggregationDesc("sum", + FunctionRegistry.getGenericUDAFEvaluator("sum", paramOIs, false, false), + params, + false, + GenericUDAFEvaluator.Mode.PARTIAL1); ArrayList outputColumnNames = new ArrayList(); outputColumnNames.add("_col0"); diff --git a/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction.q b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction.q new file mode 100644 index 0000000..13797c0 --- /dev/null +++ b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction.q @@ -0,0 +1,68 @@ +set hive.compute.query.using.stats=false; +set hive.mapred.mode=nonstrict; +set hive.explain.user=false; +set hive.optimize.ppd=true; +set hive.ppd.remove.duplicatefilters=true; +set hive.tez.dynamic.partition.pruning=true; +set hive.tez.dynamic.semijoin.reduction=true; +set hive.optimize.metadataonly=false; +set hive.optimize.index.filter=true; + +-- Create Tables +create table alltypesorc_int ( cint int, cstring string ) stored as ORC; +create table srcpart_date (key string, value string) partitioned by (ds string ) stored as ORC; +CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC; + +-- Add Partitions +alter table srcpart_date add partition (ds = "2008-04-08"); +alter table srcpart_date add partition (ds = "2008-04-09"); + +alter table srcpart_small add partition (ds = "2008-04-08"); +alter table srcpart_small add partition (ds = "2008-04-09"); + +-- Load +insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc; +insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08"; +insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"; +insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"; +set hive.tez.dynamic.semijoin.reduction=false; + +-- single column, single key +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1); +set hive.tez.dynamic.semijoin.reduction=true; +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1); +set hive.tez.dynamic.semijoin.reduction=true; + +-- Mix dynamic partition pruning(DPP) and min/max bloom filter optimizations. Should pick the DPP. 
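+-- [Editor's note] In this query the probe column, srcpart_small.ds, is a
+-- partition column, so full dynamic partition pruning applies; the golden
+-- output later in this patch shows a Dynamic Partitioning Event Operator
+-- targeting srcpart_small and no DynamicValue/in_bloom_filter predicates,
+-- i.e. DPP wins over the semijoin min/max/bloom-filter reduction here.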
+EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds); +set hive.tez.dynamic.semijoin.reduction=false; + +--multiple sources, single key +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring); +set hive.tez.dynamic.semijoin.reduction=true; +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring); +set hive.tez.dynamic.semijoin.reduction=false; + +-- single source, multiple keys +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1); +set hive.tez.dynamic.semijoin.reduction=true; +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1); +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1); +set hive.tez.dynamic.semijoin.reduction=false; + +-- multiple sources, different keys +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring); +set hive.tez.dynamic.semijoin.reduction=true; +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring); + +drop table srcpart_date; +drop table srcpart_small; +drop table alltypesorc_int; diff --git a/ql/src/test/results/clientpositive/llap/dynamic_partition_pruning.q.out b/ql/src/test/results/clientpositive/llap/dynamic_partition_pruning.q.out index c37f3b9..e2310d7 100644 --- a/ql/src/test/results/clientpositive/llap/dynamic_partition_pruning.q.out +++ b/ql/src/test/results/clientpositive/llap/dynamic_partition_pruning.q.out @@ -3027,6 +3027,21 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE + Dynamic Partitioning Event Operator + Target 
column: hr (string) + Target Input: srcpart + Partition key expr: hr + Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE + Target Vertex: Map 1 Execution mode: llap LLAP IO: no inputs Reducer 2 @@ -3120,9 +3135,11 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) - Reducer 3 <- Map 6 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) + Map 7 <- Reducer 5 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE) + Reducer 3 <- Map 7 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE) + Reducer 5 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -3146,7 +3163,7 @@ STAGE PLANS: value expressions: _col1 (type: string) Execution mode: llap LLAP IO: unknown - Map 5 + Map 6 Map Operator Tree: TableScan alias: srcpart_date @@ -3166,14 +3183,14 @@ STAGE PLANS: Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE Execution mode: llap LLAP IO: no inputs - Map 6 + Map 7 Map Operator Tree: TableScan alias: srcpart_hour - filterExpr: (UDFToDouble(hr) = 13.0) (type: boolean) + filterExpr: ((UDFToDouble(hr) = 13.0) and hr BETWEEN DynamicValue(RS_12_srcpart_hr_min) AND DynamicValue(RS_12_srcpart_hr_max) and in_bloom_filter(hr, DynamicValue(RS_12_srcpart_hr_bloom_filter))) (type: boolean) Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (UDFToDouble(hr) = 13.0) (type: boolean) + predicate: ((UDFToDouble(hr) = 13.0) and hr BETWEEN DynamicValue(RS_12_srcpart_hr_min) AND DynamicValue(RS_12_srcpart_hr_max) and in_bloom_filter(hr, DynamicValue(RS_12_srcpart_hr_bloom_filter))) (type: boolean) Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: hr (type: string) @@ -3202,6 +3219,19 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 1 Data size: 404 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 404 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Reducer 3 Execution mode: llap Reduce Operator Tree: @@ -3236,6 +3266,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Stage: Stage-0 Fetch Operator @@ -5402,6 +5444,21 @@ STAGE PLANS: sort order: + Map-reduce 
partition columns: _col0 (type: string) Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE + Dynamic Partitioning Event Operator + Target column: hr (string) + Target Input: srcpart + Partition key expr: hr + Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE Column stats: NONE + Target Vertex: Map 1 Execution mode: llap LLAP IO: no inputs Reducer 2 diff --git a/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out new file mode 100644 index 0000000..e89526e --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out @@ -0,0 +1,1535 @@ +PREHOOK: query: create table alltypesorc_int ( cint int, cstring string ) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@alltypesorc_int +POSTHOOK: query: create table alltypesorc_int ( cint int, cstring string ) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@alltypesorc_int +PREHOOK: query: create table srcpart_date (key string, value string) partitioned by (ds string ) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@srcpart_date +POSTHOOK: query: create table srcpart_date (key string, value string) partitioned by (ds string ) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@srcpart_date +PREHOOK: query: CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@srcpart_small +POSTHOOK: query: CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@srcpart_small +PREHOOK: query: alter table srcpart_date add partition (ds = "2008-04-08") +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@srcpart_date +POSTHOOK: query: alter table srcpart_date add partition (ds = "2008-04-08") +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@srcpart_date +POSTHOOK: Output: default@srcpart_date@ds=2008-04-08 +PREHOOK: query: alter table srcpart_date add partition (ds = "2008-04-09") +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@srcpart_date +POSTHOOK: query: alter table srcpart_date add partition (ds = "2008-04-09") +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@srcpart_date +POSTHOOK: Output: default@srcpart_date@ds=2008-04-09 +PREHOOK: query: alter table srcpart_small add partition (ds = "2008-04-08") +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@srcpart_small +POSTHOOK: query: alter table srcpart_small add partition (ds = "2008-04-08") +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@srcpart_small +POSTHOOK: Output: default@srcpart_small@ds=2008-04-08 +PREHOOK: query: alter table srcpart_small add partition (ds = "2008-04-09") +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@srcpart_small +POSTHOOK: query: alter 
table srcpart_small add partition (ds = "2008-04-09") +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@srcpart_small +POSTHOOK: Output: default@srcpart_small@ds=2008-04-09 +PREHOOK: query: insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@alltypesorc_int +POSTHOOK: query: insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@alltypesorc_int +POSTHOOK: Lineage: alltypesorc_int.cint SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: alltypesorc_int.cstring SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ] +PREHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08" +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +PREHOOK: Output: default@srcpart_date@ds=2008-04-08 +POSTHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08" +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +POSTHOOK: Output: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-08).key SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-08).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09" +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +PREHOOK: Output: default@srcpart_date@ds=2008-04-09 +POSTHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09" +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +POSTHOOK: Output: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-09).key SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-09).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09" +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +PREHOOK: Output: default@srcpart_small@ds=2008-04-09 +POSTHOOK: query: insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09" +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +POSTHOOK: Output: default@srcpart_small@ds=2008-04-09 
+POSTHOOK: Lineage: srcpart_small PARTITION(ds=2008-04-09).key1 SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_small PARTITION(ds=2008-04-09).value1 SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +8224 +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Reducer 5 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: (key is not null and key BETWEEN DynamicValue(RS_7_srcpart_small_key_min) AND DynamicValue(RS_7_srcpart_small_key_max) and in_bloom_filter(key, DynamicValue(RS_7_srcpart_small_key_bloom_filter))) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and key BETWEEN DynamicValue(RS_7_srcpart_small_key_min) AND DynamicValue(RS_7_srcpart_small_key_max) and in_bloom_filter(key, DynamicValue(RS_7_srcpart_small_key_bloom_filter))) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort 
order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +8224 +PREHOOK: query: EXPLAIN select count(*) from 
srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Dynamic Partitioning Event Operator + Target column: ds (string) + Target Input: srcpart_small + Partition key expr: ds + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Target Vertex: Map 4 + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: ds is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 360000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: ds (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +0 +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 5 + Map Operator 
Tree: + TableScan + alias: alltypesorc_int + filterExpr: cstring is not null (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: cstring is not null (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 1 to 2 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + 2 _col0 (type: string) + Statistics: Num rows: 27033 Data size: 2038454 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc_int +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc_int +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +48 +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = 
srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Reducer 6 (BROADCAST_EDGE) + Map 8 <- Reducer 4 (BROADCAST_EDGE), Reducer 7 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE), Map 8 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE) + Reducer 6 <- Map 5 (CUSTOM_SIMPLE_EDGE) + Reducer 7 <- Map 5 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: (key is not null and key BETWEEN DynamicValue(RS_10_srcpart_small_key_min) AND DynamicValue(RS_10_srcpart_small_key_max) and in_bloom_filter(key, DynamicValue(RS_10_srcpart_small_key_bloom_filter))) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and key BETWEEN DynamicValue(RS_10_srcpart_small_key_min) AND DynamicValue(RS_10_srcpart_small_key_max) and in_bloom_filter(key, DynamicValue(RS_10_srcpart_small_key_bloom_filter))) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=2000) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Map 5 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 
Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Map 8 + Map Operator Tree: + TableScan + alias: alltypesorc_int + filterExpr: (cstring is not null and cstring BETWEEN DynamicValue(RS_9_srcpart_date_cstring_min) AND DynamicValue(RS_9_srcpart_date_cstring_max) and cstring BETWEEN DynamicValue(RS_10_srcpart_small_cstring_min) AND DynamicValue(RS_10_srcpart_small_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_9_srcpart_date_cstring_bloom_filter)) and in_bloom_filter(cstring, DynamicValue(RS_10_srcpart_small_cstring_bloom_filter))) (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (cstring is not null and cstring BETWEEN DynamicValue(RS_9_srcpart_date_cstring_min) AND DynamicValue(RS_9_srcpart_date_cstring_max) and cstring BETWEEN DynamicValue(RS_10_srcpart_small_cstring_min) AND DynamicValue(RS_10_srcpart_small_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_9_srcpart_date_cstring_bloom_filter)) and in_bloom_filter(cstring, DynamicValue(RS_10_srcpart_small_cstring_bloom_filter))) (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 1 to 2 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + 2 _col0 (type: string) + Statistics: Num rows: 27033 Data size: 2038454 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=2000) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Reducer 6 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Reducer 7 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc_int +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc_int +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +48 +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 
1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: (key1 is not null and value1 is not null) (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key1 is not null and value1 is not null) (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string), value1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string), _col1 (type: string) + 1 _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: 
default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +8224 +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +8224 +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Reducer 5 (BROADCAST_EDGE), Reducer 6 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE) + Reducer 6 <- Map 4 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: (key is not null and value is not null and key BETWEEN DynamicValue(RS_7_srcpart_small_key_min) AND DynamicValue(RS_7_srcpart_small_key_max) and value BETWEEN DynamicValue(RS_7_srcpart_small_value_min) AND DynamicValue(RS_7_srcpart_small_value_max) and in_bloom_filter(key, DynamicValue(RS_7_srcpart_small_key_bloom_filter)) and in_bloom_filter(value, DynamicValue(RS_7_srcpart_small_value_bloom_filter))) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and value is not null and key BETWEEN DynamicValue(RS_7_srcpart_small_key_min) AND DynamicValue(RS_7_srcpart_small_key_max) and value BETWEEN DynamicValue(RS_7_srcpart_small_value_min) AND DynamicValue(RS_7_srcpart_small_value_max) and in_bloom_filter(key, DynamicValue(RS_7_srcpart_small_key_bloom_filter)) 
and in_bloom_filter(value, DynamicValue(RS_7_srcpart_small_value_bloom_filter))) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: (key1 is not null and value1 is not null) (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key1 is not null and value1 is not null) (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string), value1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Select Operator + expressions: _col1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string), _col1 (type: string) + 1 _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + 
aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Reducer 6 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) + Reducer 3 <- Map 6 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Execution mode: llap + LLAP IO: all inputs + Map 5 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + 
predicate: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 6 + Map Operator Tree: + TableScan + alias: alltypesorc_int + filterExpr: cstring is not null (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: cstring is not null (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: string) + sort order: + + Map-reduce partition columns: _col1 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: string) + 1 _col0 (type: string) + Statistics: Num rows: 13516 Data size: 1019227 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc_int +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small 
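
[Editor's note: illustrative aside, not part of the patch or of the golden output around it.] The contrast between the paired EXPLAIN runs of each query in this file is the point of the test: with the optimization off, the big-table scan filters only on `key is not null`; with it on, the small side additionally emits min/max/bloom_filter aggregates over its join keys, which are broadcast back and applied in the big-table scan as `key BETWEEN DynamicValue(..._min) AND DynamicValue(..._max) and in_bloom_filter(key, DynamicValue(..._bloom_filter))`. The following self-contained Java sketch shows only that shape under toy assumptions; every class and method name in it (ToyBloomFilter, RuntimeFilter, accept) is invented for illustration and is not Hive's implementation.

// Illustrative sketch only. Mirrors the structure of the plans in this file:
// the small side builds (min, max, bloom filter) over its join keys, and the
// big-table scan consults them before shuffling a row.
import java.util.BitSet;
import java.util.List;

public class SemiJoinReductionSketch {

  /** Toy stand-in for the binary blob carried by DynamicValue(..._bloom_filter). */
  static final class ToyBloomFilter {
    private final BitSet bits;
    private final int numBits;

    ToyBloomFilter(long expectedEntries) {
      // Toy sizing (~8 bits per expected entry); real sizing differs.
      this.numBits = (int) Math.min(expectedEntries * 8, 1 << 26);
      this.bits = new BitSet(numBits);
    }

    void add(String key) {
      int h = key.hashCode();
      bits.set(Math.floorMod(h, numBits));
      bits.set(Math.floorMod(h * 31 + 17, numBits));
    }

    boolean mightContain(String key) {
      int h = key.hashCode();
      return bits.get(Math.floorMod(h, numBits))
          && bits.get(Math.floorMod(h * 31 + 17, numBits));
    }
  }

  /** Built on the small side, as in: min(_col0), max(_col0), bloom_filter(_col0, ...). */
  static final class RuntimeFilter {
    String min, max;
    final ToyBloomFilter bloom;

    RuntimeFilter(List<String> smallSideKeys) {
      this.bloom = new ToyBloomFilter(smallSideKeys.size());
      for (String k : smallSideKeys) {
        if (min == null || k.compareTo(min) < 0) min = k;
        if (max == null || k.compareTo(max) > 0) max = k;
        bloom.add(k);
      }
    }

    /** The predicate pushed into the big-table scan:
     *  key BETWEEN min AND max AND in_bloom_filter(key, bloom). */
    boolean accept(String key) {
      return key != null && min != null
          && key.compareTo(min) >= 0 && key.compareTo(max) <= 0
          && bloom.mightContain(key);
    }
  }

  public static void main(String[] args) {
    RuntimeFilter f = new RuntimeFilter(List.of("2008-04-08", "2008-04-09"));
    System.out.println(f.accept("2008-04-08")); // true  -> row is shuffled
    System.out.println(f.accept("1999-01-01")); // false -> row dropped at the scan
  }
}

Note the design choice visible in the plans: a Bloom filter admits false positives, so the join condition is still fully evaluated downstream; the runtime filter only drops rows that provably cannot match, which is why correctness of the counts below is unaffected when the plans change.
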
+PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc_int +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +0 +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Reducer 7 (BROADCAST_EDGE) + Map 8 <- Reducer 5 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE) + Reducer 3 <- Map 8 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE) + Reducer 5 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 7 <- Map 6 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: (key is not null and value is not null and key BETWEEN DynamicValue(RS_10_srcpart_small_key_min) AND DynamicValue(RS_10_srcpart_small_key_max) and in_bloom_filter(key, DynamicValue(RS_10_srcpart_small_key_bloom_filter))) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and value is not null and key BETWEEN DynamicValue(RS_10_srcpart_small_key_min) AND DynamicValue(RS_10_srcpart_small_key_max) and in_bloom_filter(key, DynamicValue(RS_10_srcpart_small_key_bloom_filter))) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Execution mode: llap + LLAP IO: all inputs + Map 6 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort 
order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Map 8 + Map Operator Tree: + TableScan + alias: alltypesorc_int + filterExpr: (cstring is not null and cstring BETWEEN DynamicValue(RS_12_srcpart_date_cstring_min) AND DynamicValue(RS_12_srcpart_date_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_12_srcpart_date_cstring_bloom_filter))) (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (cstring is not null and cstring BETWEEN DynamicValue(RS_12_srcpart_date_cstring_min) AND DynamicValue(RS_12_srcpart_date_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_12_srcpart_date_cstring_bloom_filter))) (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: string) + sort order: + + Map-reduce partition columns: _col1 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=2200) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: string) + 1 _col0 (type: string) + Statistics: Num rows: 13516 Data size: 1019227 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column 
stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=2200) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Reducer 7 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc_int +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc_int +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +0 +PREHOOK: query: drop table srcpart_date +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@srcpart_date +PREHOOK: Output: default@srcpart_date +POSTHOOK: query: drop table srcpart_date +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Output: default@srcpart_date +PREHOOK: query: drop table srcpart_small +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@srcpart_small +PREHOOK: Output: default@srcpart_small +POSTHOOK: query: drop table srcpart_small +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: 
default@srcpart_small +POSTHOOK: Output: default@srcpart_small +PREHOOK: query: drop table alltypesorc_int +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@alltypesorc_int +PREHOOK: Output: default@alltypesorc_int +POSTHOOK: query: drop table alltypesorc_int +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@alltypesorc_int +POSTHOOK: Output: default@alltypesorc_int diff --git a/ql/src/test/results/clientpositive/llap/join32_lessSize.q.out b/ql/src/test/results/clientpositive/llap/join32_lessSize.q.out index 15eb751..6efd828 100644 --- a/ql/src/test/results/clientpositive/llap/join32_lessSize.q.out +++ b/ql/src/test/results/clientpositive/llap/join32_lessSize.q.out @@ -1753,7 +1753,7 @@ STAGE PLANS: 0 _col0 (type: string) 1 _col1 (type: string) outputColumnNames: _col0, _col3, _col4 - Position of Big Table: 0 + Position of Big Table: 1 Statistics: Num rows: 140 Data size: 37240 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col3 (type: string), _col0 (type: string), _col4 (type: string) diff --git a/ql/src/test/results/clientpositive/llap/llap_partitioned.q.out b/ql/src/test/results/clientpositive/llap/llap_partitioned.q.out index 541ece8..d35501e 100644 --- a/ql/src/test/results/clientpositive/llap/llap_partitioned.q.out +++ b/ql/src/test/results/clientpositive/llap/llap_partitioned.q.out @@ -1663,7 +1663,7 @@ STAGE PLANS: key expressions: ctinyint (type: tinyint) sort order: + Map-reduce partition columns: ctinyint (type: tinyint) - Statistics: Num rows: 10 Data size: 2640 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 10 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: ctinyint (type: tinyint) outputColumnNames: _col0 diff --git a/ql/src/test/results/clientpositive/llap/mergejoin.q.out b/ql/src/test/results/clientpositive/llap/mergejoin.q.out index 9d27a10..4ec2a71 100644 --- a/ql/src/test/results/clientpositive/llap/mergejoin.q.out +++ b/ql/src/test/results/clientpositive/llap/mergejoin.q.out @@ -13,17 +13,19 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: + Map 1 <- Reducer 4 (BROADCAST_EDGE) Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) + Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 Map Operator Tree: TableScan alias: a - filterExpr: key is not null (type: boolean) + filterExpr: (key is not null and key BETWEEN DynamicValue(RS_7_b_key_min) AND DynamicValue(RS_7_b_key_max) and in_bloom_filter(key, DynamicValue(RS_7_b_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: key is not null (type: boolean) + predicate: (key is not null and key BETWEEN DynamicValue(RS_7_b_key_min) AND DynamicValue(RS_7_b_key_max) and in_bloom_filter(key, DynamicValue(RS_7_b_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: key (type: string), value (type: string) @@ -56,6 +58,19 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 25 Data size: 4375 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col1 (type: string) + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 25 Data size: 4375 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=25) + mode: hash + 
outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Execution mode: llap LLAP IO: no inputs Reducer 2 @@ -76,6 +91,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=25) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Stage: Stage-0 Fetch Operator @@ -257,8 +284,10 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Map 5 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -279,16 +308,29 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=242) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: llap LLAP IO: all inputs - Map 4 + Map 5 Map Operator Tree: TableScan alias: b - filterExpr: key is not null (type: boolean) + filterExpr: (key is not null and key BETWEEN DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: key is not null (type: boolean) + predicate: (key is not null and key BETWEEN DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: int) @@ -299,7 +341,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Execution mode: llap LLAP IO: all inputs Reducer 2 Execution mode: llap @@ -335,6 +377,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=242) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) Stage: Stage-0 Fetch Operator @@ -1347,8 +1401,10 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Map 5 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1365,23 +1421,39 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=242) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: llap LLAP IO: all inputs - Map 4 + Map 5 Map Operator Tree: TableScan alias: b Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: key (type: int) - outputColumnNames: _col0 + Filter Operator + predicate: key BETWEEN DynamicValue(RS_4_a_key_min) AND DynamicValue(RS_4_a_key_max) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) + Select Operator + expressions: key (type: int) + outputColumnNames: _col0 Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap LLAP IO: all inputs Reducer 2 Execution mode: llap @@ -1417,6 +1489,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=242) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic 
stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) Stage: Stage-0 Fetch Operator @@ -1458,8 +1542,10 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: + Map 1 <- Reducer 5 (BROADCAST_EDGE) Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1467,16 +1553,19 @@ STAGE PLANS: TableScan alias: a Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: key (type: int) - outputColumnNames: _col0 + Filter Operator + predicate: key BETWEEN DynamicValue(RS_5_b_key_min) AND DynamicValue(RS_5_b_key_max) (type: boolean) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - key expressions: _col0 (type: int) - sort order: + - Map-reduce partition columns: _col0 (type: int) + Select Operator + expressions: key (type: int) + outputColumnNames: _col0 Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap LLAP IO: all inputs Map 4 Map Operator Tree: @@ -1492,7 +1581,20 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=500) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: llap LLAP IO: all inputs Reducer 2 Execution mode: llap @@ -1528,6 +1630,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=500) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) Stage: Stage-0 Fetch Operator @@ -1676,19 +1790,23 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) - Reducer 3 <- Map 6 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) + Map 1 <- Reducer 8 (BROADCAST_EDGE) + Map 6 <- Reducer 5 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE) + Reducer 3 <- Map 7 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) Reducer 4 <- 
Reducer 3 (CUSTOM_SIMPLE_EDGE) + Reducer 5 <- Map 1 (CUSTOM_SIMPLE_EDGE) + Reducer 8 <- Map 7 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 Map Operator Tree: TableScan alias: a - filterExpr: (key is not null and value is not null) (type: boolean) + filterExpr: (key is not null and value is not null and value BETWEEN DynamicValue(RS_13_c_value_min) AND DynamicValue(RS_13_c_value_max) and in_bloom_filter(value, DynamicValue(RS_13_c_value_bloom_filter))) (type: boolean) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (key is not null and value is not null) (type: boolean) + predicate: (key is not null and value is not null and value BETWEEN DynamicValue(RS_13_c_value_min) AND DynamicValue(RS_13_c_value_max) and in_bloom_filter(value, DynamicValue(RS_13_c_value_bloom_filter))) (type: boolean) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: int), value (type: string) @@ -1700,16 +1818,29 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE value expressions: _col1 (type: string) - Execution mode: vectorized, llap + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=242) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: llap LLAP IO: all inputs - Map 5 + Map 6 Map Operator Tree: TableScan alias: b - filterExpr: key is not null (type: boolean) + filterExpr: (key is not null and key BETWEEN DynamicValue(RS_9_a_key_min) AND DynamicValue(RS_9_a_key_max) and in_bloom_filter(key, DynamicValue(RS_9_a_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: key is not null (type: boolean) + predicate: (key is not null and key BETWEEN DynamicValue(RS_9_a_key_min) AND DynamicValue(RS_9_a_key_max) and in_bloom_filter(key, DynamicValue(RS_9_a_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: int) @@ -1720,9 +1851,9 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Execution mode: llap LLAP IO: all inputs - Map 6 + Map 7 Map Operator Tree: TableScan alias: c @@ -1740,6 +1871,19 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 25 Data size: 2225 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 25 Data size: 2225 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=25) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic 
stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Execution mode: llap LLAP IO: no inputs Reducer 2 @@ -1792,6 +1936,30 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=242) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Reducer 8 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=25) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Stage: Stage-0 Fetch Operator @@ -1829,8 +1997,10 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Map 5 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1851,16 +2021,29 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Select Operator + expressions: _col1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=242) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Execution mode: llap LLAP IO: all inputs - Map 4 + Map 5 Map Operator Tree: TableScan alias: b - filterExpr: value is not null (type: boolean) + filterExpr: (value is not null and value BETWEEN DynamicValue(RS_6_a_value_min) AND DynamicValue(RS_6_a_value_max) and in_bloom_filter(value, DynamicValue(RS_6_a_value_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: value is not null (type: boolean) + predicate: (value is not null and value BETWEEN DynamicValue(RS_6_a_value_min) AND DynamicValue(RS_6_a_value_max) and in_bloom_filter(value, DynamicValue(RS_6_a_value_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic 
stats: COMPLETE Column stats: NONE Select Operator expressions: value (type: string) @@ -1871,7 +2054,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Execution mode: llap LLAP IO: all inputs Reducer 2 Execution mode: llap @@ -1907,6 +2090,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=242) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Stage: Stage-0 Fetch Operator @@ -1950,10 +2145,12 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Map 7 <- Union 3 (CONTAINS) - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE), Union 3 (CONTAINS) - Reducer 4 <- Map 8 (SIMPLE_EDGE), Union 3 (SIMPLE_EDGE) + Map 8 <- Union 3 (CONTAINS) + Map 9 <- Reducer 6 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE), Union 3 (CONTAINS) + Reducer 4 <- Map 9 (SIMPLE_EDGE), Union 3 (SIMPLE_EDGE) Reducer 5 <- Reducer 4 (CUSTOM_SIMPLE_EDGE) + Reducer 6 <- Union 3 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1976,7 +2173,7 @@ STAGE PLANS: Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized, llap LLAP IO: all inputs - Map 6 + Map 7 Map Operator Tree: TableScan alias: s3 @@ -1996,7 +2193,7 @@ STAGE PLANS: Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized, llap LLAP IO: all inputs - Map 7 + Map 8 Map Operator Tree: TableScan alias: s2 @@ -2014,16 +2211,29 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 508 Data size: 51836 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 508 Data size: 51836 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=508) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: llap LLAP IO: all inputs - Map 8 + Map 9 Map Operator Tree: TableScan alias: b - filterExpr: key is not null (type: boolean) + filterExpr: (key is not null and key BETWEEN DynamicValue(RS_18_s1_key_min) AND DynamicValue(RS_18_s1_key_max) and in_bloom_filter(key, DynamicValue(RS_18_s1_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: key is not null (type: boolean) + predicate: (key is not null and key BETWEEN 
DynamicValue(RS_18_s1_key_min) AND DynamicValue(RS_18_s1_key_max) and in_bloom_filter(key, DynamicValue(RS_18_s1_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: int) @@ -2034,7 +2244,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Execution mode: llap LLAP IO: all inputs Reducer 2 Execution mode: llap @@ -2052,6 +2262,19 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 508 Data size: 51836 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 508 Data size: 51836 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=508) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) Reducer 4 Execution mode: llap Reduce Operator Tree: @@ -2086,6 +2309,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 6 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=508) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) Union 3 Vertex: Union 3 @@ -2108,8 +2343,10 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Map 5 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -2130,16 +2367,29 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Select Operator + expressions: _col1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=242) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Execution mode: llap LLAP IO: all inputs - Map 4 + Map 5 Map Operator Tree: TableScan alias: b - filterExpr: value is not null (type: boolean) + filterExpr: (value is not null 
and value BETWEEN DynamicValue(RS_6_a_value_min) AND DynamicValue(RS_6_a_value_max) and in_bloom_filter(value, DynamicValue(RS_6_a_value_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: value is not null (type: boolean) + predicate: (value is not null and value BETWEEN DynamicValue(RS_6_a_value_min) AND DynamicValue(RS_6_a_value_max) and in_bloom_filter(value, DynamicValue(RS_6_a_value_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: value (type: string) @@ -2150,7 +2400,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Execution mode: llap LLAP IO: all inputs Reducer 2 Execution mode: llap @@ -2186,6 +2436,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=242) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Stage: Stage-0 Fetch Operator @@ -2221,19 +2483,23 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) - Reducer 3 <- Map 6 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) + Map 1 <- Reducer 8 (BROADCAST_EDGE) + Map 6 <- Reducer 5 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE) + Reducer 3 <- Map 7 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE) + Reducer 5 <- Map 1 (CUSTOM_SIMPLE_EDGE) + Reducer 8 <- Map 7 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 Map Operator Tree: TableScan alias: a - filterExpr: (key is not null and value is not null) (type: boolean) + filterExpr: (key is not null and value is not null and value BETWEEN DynamicValue(RS_13_c_value_min) AND DynamicValue(RS_13_c_value_max) and in_bloom_filter(value, DynamicValue(RS_13_c_value_bloom_filter))) (type: boolean) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (key is not null and value is not null) (type: boolean) + predicate: (key is not null and value is not null and value BETWEEN DynamicValue(RS_13_c_value_min) AND DynamicValue(RS_13_c_value_max) and in_bloom_filter(value, DynamicValue(RS_13_c_value_bloom_filter))) (type: boolean) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: int), value (type: string) @@ -2245,16 +2511,29 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE value expressions: _col1 (type: string) - Execution mode: vectorized, llap + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 242 Data size: 24684 Basic stats: 
COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=242) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: llap LLAP IO: all inputs - Map 5 + Map 6 Map Operator Tree: TableScan alias: b - filterExpr: key is not null (type: boolean) + filterExpr: (key is not null and key BETWEEN DynamicValue(RS_9_a_key_min) AND DynamicValue(RS_9_a_key_max) and in_bloom_filter(key, DynamicValue(RS_9_a_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: key is not null (type: boolean) + predicate: (key is not null and key BETWEEN DynamicValue(RS_9_a_key_min) AND DynamicValue(RS_9_a_key_max) and in_bloom_filter(key, DynamicValue(RS_9_a_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: int) @@ -2265,9 +2544,9 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Execution mode: llap LLAP IO: all inputs - Map 6 + Map 7 Map Operator Tree: TableScan alias: c @@ -2285,6 +2564,19 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 25 Data size: 2225 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 25 Data size: 2225 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=25) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Execution mode: llap LLAP IO: no inputs Reducer 2 @@ -2337,6 +2629,30 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=242) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Reducer 8 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=25) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 
1 Data size: 552 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Stage: Stage-0 Fetch Operator @@ -2382,10 +2698,12 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Map 7 <- Union 3 (CONTAINS) - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE), Union 3 (CONTAINS) - Reducer 4 <- Map 8 (SIMPLE_EDGE), Union 3 (SIMPLE_EDGE) + Map 8 <- Union 3 (CONTAINS) + Map 9 <- Reducer 6 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE), Union 3 (CONTAINS) + Reducer 4 <- Map 9 (SIMPLE_EDGE), Union 3 (SIMPLE_EDGE) Reducer 5 <- Reducer 4 (CUSTOM_SIMPLE_EDGE) + Reducer 6 <- Union 3 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -2408,7 +2726,7 @@ STAGE PLANS: Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized, llap LLAP IO: all inputs - Map 6 + Map 7 Map Operator Tree: TableScan alias: s3 @@ -2428,7 +2746,7 @@ STAGE PLANS: Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized, llap LLAP IO: all inputs - Map 7 + Map 8 Map Operator Tree: TableScan alias: s2 @@ -2446,16 +2764,29 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 508 Data size: 51836 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 508 Data size: 51836 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=508) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: llap LLAP IO: all inputs - Map 8 + Map 9 Map Operator Tree: TableScan alias: b - filterExpr: key is not null (type: boolean) + filterExpr: (key is not null and key BETWEEN DynamicValue(RS_18_s1_key_min) AND DynamicValue(RS_18_s1_key_max) and in_bloom_filter(key, DynamicValue(RS_18_s1_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: key is not null (type: boolean) + predicate: (key is not null and key BETWEEN DynamicValue(RS_18_s1_key_min) AND DynamicValue(RS_18_s1_key_max) and in_bloom_filter(key, DynamicValue(RS_18_s1_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: int) @@ -2466,7 +2797,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Execution mode: llap LLAP IO: all inputs Reducer 2 Execution mode: llap @@ -2484,6 +2815,19 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 508 Data size: 51836 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 508 Data size: 51836 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), 
bloom_filter(_col0, expectedEntries=508) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) Reducer 4 Execution mode: llap Reduce Operator Tree: @@ -2518,6 +2862,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 6 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=508) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) Union 3 Vertex: Union 3 @@ -2554,10 +2910,12 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: + Map 6 <- Reducer 5 (BROADCAST_EDGE) Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Reducer 2 (SIMPLE_EDGE), Reducer 6 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE), Reducer 7 (SIMPLE_EDGE) Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE) - Reducer 6 <- Map 5 (SIMPLE_EDGE) + Reducer 5 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 7 <- Map 6 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -2579,14 +2937,14 @@ STAGE PLANS: Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized, llap LLAP IO: all inputs - Map 5 + Map 6 Map Operator Tree: TableScan alias: t2 - filterExpr: key is not null (type: boolean) + filterExpr: (key is not null and key BETWEEN DynamicValue(RS_12_t1_key_min) AND DynamicValue(RS_12_t1_key_max) and in_bloom_filter(key, DynamicValue(RS_12_t1_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: key is not null (type: boolean) + predicate: (key is not null and key BETWEEN DynamicValue(RS_12_t1_key_min) AND DynamicValue(RS_12_t1_key_max) and in_bloom_filter(key, DynamicValue(RS_12_t1_key_bloom_filter))) (type: boolean) Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: int), value (type: string) @@ -2596,10 +2954,10 @@ STAGE PLANS: key expressions: _col0 (type: int), _col1 (type: string) sort order: ++ Statistics: Num rows: 500 Data size: 51000 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Execution mode: llap LLAP IO: all inputs Reducer 2 - Execution mode: vectorized, llap + Execution mode: llap Reduce Operator Tree: Select Operator expressions: KEY.reducesinkkey0 (type: int) @@ -2610,6 +2968,19 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 242 Data size: 24684 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=242) + mode: hash + 
outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) Reducer 3 Execution mode: llap Reduce Operator Tree: @@ -2644,7 +3015,19 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - Reducer 6 + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=242) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Reducer 7 Execution mode: vectorized, llap Reduce Operator Tree: Select Operator @@ -3820,7 +4203,7 @@ NULL NULL NULL NULL NULL NULL 97 val_97 2008-04-08 NULL NULL NULL NULL NULL NULL 97 val_97 2008-04-08 NULL NULL NULL NULL NULL NULL 98 val_98 2008-04-08 NULL NULL NULL NULL NULL NULL 98 val_98 2008-04-08 -Warning: Shuffle Join MERGEJOIN[25][tables = [a, b]] in Stage 'Reducer 2' is a cross product +Warning: Shuffle Join MERGEJOIN[30][tables = [a, b]] in Stage 'Reducer 2' is a cross product PREHOOK: query: select * from (select * from tab where tab.key = 0)a full outer join diff --git a/ql/src/test/results/clientpositive/llap/orc_llap.q.out b/ql/src/test/results/clientpositive/llap/orc_llap.q.out index 6b6706e..90055a5 100644 --- a/ql/src/test/results/clientpositive/llap/orc_llap.q.out +++ b/ql/src/test/results/clientpositive/llap/orc_llap.q.out @@ -559,8 +559,10 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Map 5 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -582,16 +584,29 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: smallint) Statistics: Num rows: 122880 Data size: 29079940 Basic stats: COMPLETE Column stats: NONE value expressions: _col2 (type: string) - Execution mode: vectorized, llap + Select Operator + expressions: _col0 (type: smallint) + outputColumnNames: _col0 + Statistics: Num rows: 122880 Data size: 29079940 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=122880) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: smallint), _col1 (type: smallint), _col2 (type: binary) + Execution mode: llap LLAP IO: all inputs - Map 4 + Map 5 Map Operator Tree: TableScan alias: o2 - filterExpr: (csmallint is not null and cbigint is not null) (type: boolean) + filterExpr: (csmallint is not null and cbigint is not null and csmallint BETWEEN DynamicValue(RS_6_o1_csmallint_min) AND 
DynamicValue(RS_6_o1_csmallint_max) and in_bloom_filter(csmallint, DynamicValue(RS_6_o1_csmallint_bloom_filter))) (type: boolean) Statistics: Num rows: 122880 Data size: 29079940 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (csmallint is not null and cbigint is not null) (type: boolean) + predicate: (csmallint is not null and cbigint is not null and csmallint BETWEEN DynamicValue(RS_6_o1_csmallint_min) AND DynamicValue(RS_6_o1_csmallint_max) and in_bloom_filter(csmallint, DynamicValue(RS_6_o1_csmallint_bloom_filter))) (type: boolean) Statistics: Num rows: 122880 Data size: 29079940 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: csmallint (type: smallint), cstring2 (type: string) @@ -603,7 +618,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: smallint) Statistics: Num rows: 122880 Data size: 29079940 Basic stats: COMPLETE Column stats: NONE value expressions: _col2 (type: string) - Execution mode: vectorized, llap + Execution mode: llap LLAP IO: all inputs Reducer 2 Execution mode: llap @@ -644,6 +659,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=122880) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: smallint), _col1 (type: smallint), _col2 (type: binary) Stage: Stage-0 Fetch Operator @@ -1024,8 +1051,10 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Map 5 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -1047,16 +1076,29 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: smallint) Statistics: Num rows: 245760 Data size: 58159880 Basic stats: COMPLETE Column stats: NONE value expressions: _col2 (type: string) - Execution mode: vectorized, llap + Select Operator + expressions: _col0 (type: smallint) + outputColumnNames: _col0 + Statistics: Num rows: 245760 Data size: 58159880 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=245760) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: smallint), _col1 (type: smallint), _col2 (type: binary) + Execution mode: llap LLAP IO: all inputs - Map 4 + Map 5 Map Operator Tree: TableScan alias: o2 - filterExpr: (csmallint is not null and cbigint is not null) (type: boolean) + filterExpr: (csmallint is not null and cbigint is not null and csmallint BETWEEN DynamicValue(RS_6_o1_csmallint_min) AND DynamicValue(RS_6_o1_csmallint_max) and in_bloom_filter(csmallint, DynamicValue(RS_6_o1_csmallint_bloom_filter))) (type: boolean) Statistics: Num rows: 245760 
Data size: 58159880 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (csmallint is not null and cbigint is not null) (type: boolean) + predicate: (csmallint is not null and cbigint is not null and csmallint BETWEEN DynamicValue(RS_6_o1_csmallint_min) AND DynamicValue(RS_6_o1_csmallint_max) and in_bloom_filter(csmallint, DynamicValue(RS_6_o1_csmallint_bloom_filter))) (type: boolean) Statistics: Num rows: 245760 Data size: 58159880 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: csmallint (type: smallint), cstring2 (type: string) @@ -1068,7 +1110,7 @@ STAGE PLANS: Map-reduce partition columns: _col0 (type: smallint) Statistics: Num rows: 245760 Data size: 58159880 Basic stats: COMPLETE Column stats: NONE value expressions: _col2 (type: string) - Execution mode: vectorized, llap + Execution mode: llap LLAP IO: all inputs Reducer 2 Execution mode: llap @@ -1109,6 +1151,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=245760) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: smallint), _col1 (type: smallint), _col2 (type: binary) Stage: Stage-0 Fetch Operator diff --git a/ql/src/test/results/clientpositive/llap/subquery_scalar.q.out b/ql/src/test/results/clientpositive/llap/subquery_scalar.q.out index 09b0a83..3be4b6a 100644 --- a/ql/src/test/results/clientpositive/llap/subquery_scalar.q.out +++ b/ql/src/test/results/clientpositive/llap/subquery_scalar.q.out @@ -2652,9 +2652,9 @@ POSTHOOK: Input: default@part_null 42669 almond antique medium spring khaki Manufacturer#5 Brand#51 STANDARD BURNISHED TIN 6 MED CAN 1611.66 sits haggl 195606 almond aquamarine sandy cyan gainsboro Manufacturer#2 Brand#25 STANDARD PLATED TIN 18 SM PKG 1701.6 ic de 144293 almond antique olive coral navajo Manufacturer#3 Brand#34 STANDARD POLISHED STEEL 45 JUMBO CAN 1337.29 ag furiously about -Warning: Shuffle Join MERGEJOIN[94][tables = [$hdt$_0, $hdt$_1, $hdt$_2]] in Stage 'Reducer 2' is a cross product -Warning: Shuffle Join MERGEJOIN[91][tables = [$hdt$_4, $hdt$_5]] in Stage 'Reducer 12' is a cross product -Warning: Shuffle Join MERGEJOIN[92][tables = [$hdt$_4, $hdt$_5, $hdt$_6]] in Stage 'Reducer 13' is a cross product +Warning: Shuffle Join MERGEJOIN[91][tables = [$hdt$_0, $hdt$_1, $hdt$_2]] in Stage 'Reducer 2' is a cross product +Warning: Shuffle Join MERGEJOIN[92][tables = [$hdt$_4, $hdt$_5]] in Stage 'Reducer 12' is a cross product +Warning: Shuffle Join MERGEJOIN[93][tables = [$hdt$_4, $hdt$_5, $hdt$_6]] in Stage 'Reducer 13' is a cross product PREHOOK: query: explain select * from part where p_brand <> (select min(p_brand) from part ) AND p_size IN (select (p_size) from part p where p.p_type = part.p_type ) AND p_size <> 340 PREHOOK: type: QUERY POSTHOOK: query: explain select * from part where p_brand <> (select min(p_brand) from part ) AND p_size IN (select (p_size) from part p where p.p_type = part.p_type ) AND p_size <> 340 @@ -3020,9 +3020,9 @@ STAGE PLANS: Processor Tree: ListSink -Warning: 
Shuffle Join MERGEJOIN[94][tables = [$hdt$_0, $hdt$_1, $hdt$_2]] in Stage 'Reducer 2' is a cross product -Warning: Shuffle Join MERGEJOIN[91][tables = [$hdt$_4, $hdt$_5]] in Stage 'Reducer 12' is a cross product -Warning: Shuffle Join MERGEJOIN[92][tables = [$hdt$_4, $hdt$_5, $hdt$_6]] in Stage 'Reducer 13' is a cross product +Warning: Shuffle Join MERGEJOIN[91][tables = [$hdt$_0, $hdt$_1, $hdt$_2]] in Stage 'Reducer 2' is a cross product +Warning: Shuffle Join MERGEJOIN[92][tables = [$hdt$_4, $hdt$_5]] in Stage 'Reducer 12' is a cross product +Warning: Shuffle Join MERGEJOIN[93][tables = [$hdt$_4, $hdt$_5, $hdt$_6]] in Stage 'Reducer 13' is a cross product PREHOOK: query: select * from part where p_brand <> (select min(p_brand) from part ) AND p_size IN (select (p_size) from part p where p.p_type = part.p_type ) AND p_size <> 340 PREHOOK: type: QUERY PREHOOK: Input: default@part @@ -5581,7 +5581,7 @@ POSTHOOK: Input: default@lineitem 175839 874 182052 9607 Warning: Shuffle Join MERGEJOIN[98][tables = [$hdt$_3, $hdt$_4]] in Stage 'Reducer 9' is a cross product -Warning: Shuffle Join MERGEJOIN[100][tables = [$hdt$_4, $hdt$_5]] in Stage 'Reducer 16' is a cross product +Warning: Shuffle Join MERGEJOIN[99][tables = [$hdt$_4, $hdt$_5]] in Stage 'Reducer 16' is a cross product PREHOOK: query: explain select sum(l_extendedprice) from lineitem, part where p_partkey = l_partkey and l_quantity > (select avg(l_quantity) from lineitem where l_partkey = p_partkey) PREHOOK: type: QUERY POSTHOOK: query: explain select sum(l_extendedprice) from lineitem, part where p_partkey = l_partkey and l_quantity > (select avg(l_quantity) from lineitem where l_partkey = p_partkey) @@ -5946,7 +5946,7 @@ STAGE PLANS: ListSink Warning: Shuffle Join MERGEJOIN[98][tables = [$hdt$_3, $hdt$_4]] in Stage 'Reducer 9' is a cross product -Warning: Shuffle Join MERGEJOIN[100][tables = [$hdt$_4, $hdt$_5]] in Stage 'Reducer 16' is a cross product +Warning: Shuffle Join MERGEJOIN[99][tables = [$hdt$_4, $hdt$_5]] in Stage 'Reducer 16' is a cross product PREHOOK: query: select sum(l_extendedprice) from lineitem, part where p_partkey = l_partkey and l_quantity > (select avg(l_quantity) from lineitem where l_partkey = p_partkey) PREHOOK: type: QUERY PREHOOK: Input: default@lineitem diff --git a/ql/src/test/results/clientpositive/llap/vectorized_dynamic_partition_pruning.q.out b/ql/src/test/results/clientpositive/llap/vectorized_dynamic_partition_pruning.q.out index dc1b60f..3d087b3 100644 --- a/ql/src/test/results/clientpositive/llap/vectorized_dynamic_partition_pruning.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorized_dynamic_partition_pruning.q.out @@ -2784,6 +2784,21 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 1 Data size: 172 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 172 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 172 Basic stats: COMPLETE Column stats: NONE + Dynamic Partitioning Event Operator + Target column: hr (string) + Target Input: srcpart + Partition key expr: hr + Statistics: Num rows: 1 Data size: 172 Basic stats: COMPLETE Column stats: NONE + Target Vertex: Map 1 Execution mode: vectorized, llap LLAP IO: all inputs Reducer 2 @@ -2877,9 +2892,11 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - 
Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) - Reducer 3 <- Map 6 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) + Map 7 <- Reducer 5 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE) + Reducer 3 <- Map 7 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE) + Reducer 5 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -2903,7 +2920,7 @@ STAGE PLANS: value expressions: _col1 (type: string) Execution mode: vectorized, llap LLAP IO: unknown - Map 5 + Map 6 Map Operator Tree: TableScan alias: srcpart_date @@ -2923,14 +2940,14 @@ STAGE PLANS: Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE Execution mode: vectorized, llap LLAP IO: all inputs - Map 6 + Map 7 Map Operator Tree: TableScan alias: srcpart_hour - filterExpr: (UDFToDouble(hr) = 13.0) (type: boolean) + filterExpr: ((UDFToDouble(hr) = 13.0) and hr BETWEEN DynamicValue(RS_12_srcpart_hr_min) AND DynamicValue(RS_12_srcpart_hr_max) and in_bloom_filter(hr, DynamicValue(RS_12_srcpart_hr_bloom_filter))) (type: boolean) Statistics: Num rows: 2 Data size: 344 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (UDFToDouble(hr) = 13.0) (type: boolean) + predicate: ((UDFToDouble(hr) = 13.0) and hr BETWEEN DynamicValue(RS_12_srcpart_hr_min) AND DynamicValue(RS_12_srcpart_hr_max) and in_bloom_filter(hr, DynamicValue(RS_12_srcpart_hr_bloom_filter))) (type: boolean) Statistics: Num rows: 1 Data size: 172 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: hr (type: string) @@ -2941,7 +2958,7 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 1 Data size: 172 Basic stats: COMPLETE Column stats: NONE - Execution mode: vectorized, llap + Execution mode: llap LLAP IO: all inputs Reducer 2 Execution mode: llap @@ -2959,6 +2976,19 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col1 (type: string) Statistics: Num rows: 1 Data size: 404 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 404 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Reducer 3 Execution mode: llap Reduce Operator Tree: @@ -2993,6 +3023,18 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) Stage: Stage-0 Fetch Operator @@ -5036,6 +5078,21 @@ STAGE PLANS: sort order: + 
Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 1 Data size: 172 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 172 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 172 Basic stats: COMPLETE Column stats: NONE + Dynamic Partitioning Event Operator + Target column: hr (string) + Target Input: srcpart + Partition key expr: hr + Statistics: Num rows: 1 Data size: 172 Basic stats: COMPLETE Column stats: NONE + Target Vertex: Map 1 Execution mode: vectorized, llap LLAP IO: all inputs Reducer 2 diff --git a/ql/src/test/results/clientpositive/perf/query16.q.out b/ql/src/test/results/clientpositive/perf/query16.q.out index 1648ddb..1be5fba 100644 --- a/ql/src/test/results/clientpositive/perf/query16.q.out +++ b/ql/src/test/results/clientpositive/perf/query16.q.out @@ -1,5 +1,5 @@ Warning: Shuffle Join MERGEJOIN[141][tables = [$hdt$_2, $hdt$_3, $hdt$_1, $hdt$_4]] in Stage 'Reducer 13' is a cross product -Warning: Shuffle Join MERGEJOIN[143][tables = [$hdt$_2, $hdt$_3, $hdt$_1, $hdt$_4]] in Stage 'Reducer 25' is a cross product +Warning: Shuffle Join MERGEJOIN[142][tables = [$hdt$_2, $hdt$_3, $hdt$_1, $hdt$_4]] in Stage 'Reducer 25' is a cross product PREHOOK: query: explain select count(distinct cs_order_number) as `order count` ,sum(cs_ext_ship_cost) as `total shipping cost` @@ -117,7 +117,7 @@ Stage-0 Output:["_col2","_col3"] Filter Operator [FIL_63] (rows=5072854730221289472 width=1) predicate:(_col2 <> _col0) - Merge Join Operator [MERGEJOIN_144] (rows=5072854730221289472 width=1) + Merge Join Operator [MERGEJOIN_147] (rows=5072854730221289472 width=1) Conds:RS_60._col1=RS_61._col1(Inner),Output:["_col0","_col2","_col3"] <-Map 21 [SIMPLE_EDGE] SHUFFLE [RS_60] @@ -140,7 +140,7 @@ Stage-0 Output:["_col0","_col1"],keys:_col4, _col3 Select Operator [SEL_55] (rows=9223372036854775807 width=1) Output:["_col4","_col3"] - Merge Join Operator [MERGEJOIN_143] (rows=9223372036854775807 width=1) + Merge Join Operator [MERGEJOIN_142] (rows=9223372036854775807 width=1) Conds:(Inner),(Inner),(Inner),Output:["_col3","_col4"] <-Map 24 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_50] @@ -180,7 +180,7 @@ Stage-0 PartitionCols:_col0 Group By Operator [GBY_36] (rows=5072854730221289472 width=1) Output:["_col0"],keys:_col0 - Merge Join Operator [MERGEJOIN_142] (rows=5072854730221289472 width=1) + Merge Join Operator [MERGEJOIN_146] (rows=5072854730221289472 width=1) Conds:RS_32._col0=RS_33._col0(Inner),Output:["_col0"] <-Map 20 [SIMPLE_EDGE] SHUFFLE [RS_33] @@ -225,7 +225,7 @@ Stage-0 <-Reducer 4 [SIMPLE_EDGE] SHUFFLE [RS_78] PartitionCols:_col4 - Merge Join Operator [MERGEJOIN_147] (rows=383314495 width=135) + Merge Join Operator [MERGEJOIN_145] (rows=383314495 width=135) Conds:RS_75._col2=RS_76._col0(Inner),Output:["_col3","_col4","_col5","_col6"] <-Map 11 [SIMPLE_EDGE] SHUFFLE [RS_76] @@ -239,7 +239,7 @@ Stage-0 <-Reducer 3 [SIMPLE_EDGE] SHUFFLE [RS_75] PartitionCols:_col2 - Merge Join Operator [MERGEJOIN_146] (rows=348467716 width=135) + Merge Join Operator [MERGEJOIN_144] (rows=348467716 width=135) Conds:RS_72._col1=RS_73._col0(Inner),Output:["_col2","_col3","_col4","_col5","_col6"] <-Map 10 [SIMPLE_EDGE] SHUFFLE [RS_73] @@ -253,7 +253,7 @@ Stage-0 <-Reducer 2 [SIMPLE_EDGE] SHUFFLE [RS_72] PartitionCols:_col1 - Merge Join Operator [MERGEJOIN_145] 
(rows=316788826 width=135) + Merge Join Operator [MERGEJOIN_143] (rows=316788826 width=135) Conds:RS_69._col0=RS_70._col0(Inner),Output:["_col1","_col2","_col3","_col4","_col5","_col6"] <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_69] diff --git a/ql/src/test/results/clientpositive/perf/query6.q.out b/ql/src/test/results/clientpositive/perf/query6.q.out index 5500e7a..094459e 100644 --- a/ql/src/test/results/clientpositive/perf/query6.q.out +++ b/ql/src/test/results/clientpositive/perf/query6.q.out @@ -1,4 +1,4 @@ -Warning: Shuffle Join MERGEJOIN[197][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +Warning: Shuffle Join MERGEJOIN[196][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product Warning: Shuffle Join MERGEJOIN[193][tables = [$hdt$_5, $hdt$_6]] in Stage 'Reducer 21' is a cross product Warning: Shuffle Join MERGEJOIN[194][tables = [$hdt$_5, $hdt$_6, $hdt$_7]] in Stage 'Reducer 22' is a cross product Warning: Shuffle Join MERGEJOIN[195][tables = [$hdt$_5, $hdt$_6, $hdt$_7, $hdt$_8, $hdt$_9, $hdt$_10, $hdt$_11]] in Stage 'Reducer 23' is a cross product @@ -160,7 +160,7 @@ Stage-0 Output:["_col0","_col1"],aggregations:["avg(_col0)"],keys:_col2 Select Operator [SEL_87] (rows=5072854730221289472 width=1) Output:["_col2","_col0"] - Merge Join Operator [MERGEJOIN_196] (rows=5072854730221289472 width=1) + Merge Join Operator [MERGEJOIN_198] (rows=5072854730221289472 width=1) Conds:RS_84._col1=RS_85._col0(Inner),Output:["_col0","_col2"] <-Map 17 [SIMPLE_EDGE] SHUFFLE [RS_84] @@ -264,7 +264,7 @@ Stage-0 <-Reducer 12 [SIMPLE_EDGE] SHUFFLE [RS_110] PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_198] (rows=80353 width=1119) + Merge Join Operator [MERGEJOIN_197] (rows=80353 width=1119) Conds:RS_27._col1=RS_28._col0(Inner),Output:["_col0"] <-Map 11 [SIMPLE_EDGE] SHUFFLE [RS_27] @@ -294,7 +294,7 @@ Stage-0 <-Reducer 2 [SIMPLE_EDGE] SHUFFLE [RS_109] PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_197] (rows=575995635 width=97) + Merge Join Operator [MERGEJOIN_196] (rows=575995635 width=97) Conds:(Inner),Output:["_col0","_col1","_col2"] <-Map 1 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_106] diff --git a/ql/src/test/results/clientpositive/perf/query83.q.out b/ql/src/test/results/clientpositive/perf/query83.q.out index 004dc41..ee448d4 100644 --- a/ql/src/test/results/clientpositive/perf/query83.q.out +++ b/ql/src/test/results/clientpositive/perf/query83.q.out @@ -190,7 +190,7 @@ Stage-0 <-Reducer 16 [SIMPLE_EDGE] SHUFFLE [RS_76] PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_224] (rows=63350266 width=77) + Merge Join Operator [MERGEJOIN_222] (rows=63350266 width=77) Conds:RS_73._col1=RS_74._col0(Inner),Output:["_col0","_col2","_col4"] <-Map 15 [SIMPLE_EDGE] SHUFFLE [RS_73] @@ -213,7 +213,7 @@ Stage-0 <-Reducer 21 [SIMPLE_EDGE] SHUFFLE [RS_77] PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_225] (rows=80353 width=1119) + Merge Join Operator [MERGEJOIN_224] (rows=80353 width=1119) Conds:RS_69._col1=RS_70._col0(Inner),Output:["_col0"] <-Map 20 [SIMPLE_EDGE] SHUFFLE [RS_69] @@ -234,7 +234,7 @@ Stage-0 PartitionCols:_col0 Group By Operator [GBY_65] (rows=80353 width=1119) Output:["_col0"],keys:_col0 - Merge Join Operator [MERGEJOIN_220] (rows=80353 width=1119) + Merge Join Operator [MERGEJOIN_223] (rows=80353 width=1119) Conds:RS_61._col1=RS_62._col0(Inner),Output:["_col0"] <-Map 22 [SIMPLE_EDGE] SHUFFLE [RS_61] @@ -278,7 +278,7 @@ Stage-0 <-Reducer 28 [SIMPLE_EDGE] SHUFFLE [RS_119] PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_226] 
(rows=15838314 width=92) + Merge Join Operator [MERGEJOIN_225] (rows=15838314 width=92) Conds:RS_116._col1=RS_117._col0(Inner),Output:["_col0","_col2","_col4"] <-Map 27 [SIMPLE_EDGE] SHUFFLE [RS_116] @@ -322,7 +322,7 @@ Stage-0 PartitionCols:_col0 Group By Operator [GBY_108] (rows=80353 width=1119) Output:["_col0"],keys:_col0 - Merge Join Operator [MERGEJOIN_221] (rows=80353 width=1119) + Merge Join Operator [MERGEJOIN_226] (rows=80353 width=1119) Conds:RS_104._col1=RS_105._col0(Inner),Output:["_col0"] <-Map 34 [SIMPLE_EDGE] SHUFFLE [RS_104] @@ -366,7 +366,7 @@ Stage-0 <-Reducer 2 [SIMPLE_EDGE] SHUFFLE [RS_33] PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_222] (rows=31678769 width=106) + Merge Join Operator [MERGEJOIN_219] (rows=31678769 width=106) Conds:RS_30._col1=RS_31._col0(Inner),Output:["_col0","_col2","_col4"] <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_30] @@ -389,7 +389,7 @@ Stage-0 <-Reducer 9 [SIMPLE_EDGE] SHUFFLE [RS_34] PartitionCols:_col0 - Merge Join Operator [MERGEJOIN_223] (rows=80353 width=1119) + Merge Join Operator [MERGEJOIN_221] (rows=80353 width=1119) Conds:RS_26._col1=RS_27._col0(Inner),Output:["_col0"] <-Map 8 [SIMPLE_EDGE] SHUFFLE [RS_26] @@ -410,7 +410,7 @@ Stage-0 PartitionCols:_col0 Group By Operator [GBY_22] (rows=80353 width=1119) Output:["_col0"],keys:_col0 - Merge Join Operator [MERGEJOIN_219] (rows=80353 width=1119) + Merge Join Operator [MERGEJOIN_220] (rows=80353 width=1119) Conds:RS_18._col1=RS_19._col0(Inner),Output:["_col0"] <-Map 10 [SIMPLE_EDGE] SHUFFLE [RS_18] diff --git a/ql/src/test/results/clientpositive/show_functions.q.out b/ql/src/test/results/clientpositive/show_functions.q.out index 90b86c3..b8daea9 100644 --- a/ql/src/test/results/clientpositive/show_functions.q.out +++ b/ql/src/test/results/clientpositive/show_functions.q.out @@ -36,6 +36,7 @@ avg base64 between bin +bloom_filter bround case cbrt @@ -109,6 +110,7 @@ histogram_numeric hour if in +in_bloom_filter in_file index initcap diff --git a/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out b/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out index d15c83f..9f1a401 100644 --- a/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out +++ b/ql/src/test/results/clientpositive/tez/explainanalyze_3.q.out @@ -820,34 +820,101 @@ POSTHOOK: query: explain analyze select a.key, a.value, b.value from tab a join tab_part b on a.key = b.key POSTHOOK: type: QUERY -Plan optimized by CBO. 
- -Vertex dependency in root stage -Map 2 <- Map 1 (CUSTOM_EDGE) - -Stage-0 - Fetch Operator - limit:-1 - Stage-1 - Map 2 - File Output Operator [FS_10] - Select Operator [SEL_9] (rows=550/480 width=18) - Output:["_col0","_col1","_col2"] - Map Join Operator [MAPJOIN_15] (rows=550/480 width=18) - BucketMapJoin:true,Conds:RS_6._col0=SEL_5._col0(Inner),HybridGraceHashJoin:true,Output:["_col0","_col1","_col3"] - <-Map 1 [CUSTOM_EDGE] - MULTICAST [RS_6] - PartitionCols:_col0 - Select Operator [SEL_2] (rows=242/242 width=18) - Output:["_col0","_col1"] - Filter Operator [FIL_13] (rows=242/242 width=18) - predicate:key is not null - TableScan [TS_0] (rows=242/242 width=18) - default@tab,a,Tbl:COMPLETE,Col:NONE,Output:["key","value"] - <-Select Operator [SEL_5] (rows=500/500 width=18) - Output:["_col0","_col1"] - Filter Operator [FIL_14] (rows=500/500 width=18) - predicate:key is not null - TableScan [TS_3] (rows=500/500 width=18) - default@tab_part,b,Tbl:COMPLETE,Col:NONE,Output:["key","value"] +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 3 <- Map 1 (CUSTOM_EDGE), Reducer 2 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 242/242 Data size: 4502 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 242/242 Data size: 4502 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 242/242 Data size: 4502 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 242/242 Data size: 4502 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 242/242 Data size: 4502 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=242) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1/3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1/3 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Map 3 + Map Operator Tree: + TableScan + alias: b + filterExpr: (key is not null and key BETWEEN DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter))) (type: boolean) + Statistics: Num rows: 500/500 Data size: 9312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and key BETWEEN DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter))) (type: boolean) + Statistics: Num rows: 500/244 Data size: 9312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500/244 Data size: 9312 Basic stats: COMPLETE Column stats: NONE + Map Join 
Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0, _col1, _col3 + input vertices: + 0 Map 1 + Statistics: Num rows: 550/480 Data size: 10243 Basic stats: COMPLETE Column stats: NONE + HybridGraceHashJoin: true + Select Operator + expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 550/480 Data size: 10243 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 550/480 Data size: 10243 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=242) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1/1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1/1 Data size: 12 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink diff --git a/ql/src/test/results/clientpositive/tez/explainuser_3.q.out b/ql/src/test/results/clientpositive/tez/explainuser_3.q.out index 2db8b5e..fbf61ef 100644 --- a/ql/src/test/results/clientpositive/tez/explainuser_3.q.out +++ b/ql/src/test/results/clientpositive/tez/explainuser_3.q.out @@ -31,13 +31,13 @@ Stage-0 Stage-1 Reducer 2 vectorized File Output Operator [FS_8] - Select Operator [SEL_7] (rows=16 width=107) + Select Operator [SEL_7] (rows=16 width=106) Output:["_col0","_col1"] <-Map 1 [SIMPLE_EDGE] vectorized SHUFFLE [RS_6] - Select Operator [SEL_5] (rows=16 width=107) + Select Operator [SEL_5] (rows=16 width=106) Output:["_col0","_col1"] - TableScan [TS_0] (rows=16 width=107) + TableScan [TS_0] (rows=16 width=106) default@acid_vectorized,acid_vectorized, ACID table,Tbl:COMPLETE,Col:NONE,Output:["a","b"] PREHOOK: query: explain select key, value @@ -640,34 +640,101 @@ POSTHOOK: query: explain select a.key, a.value, b.value from tab a join tab_part b on a.key = b.key POSTHOOK: type: QUERY -Plan optimized by CBO. 
diff --git a/ql/src/test/results/clientpositive/tez/explainuser_3.q.out b/ql/src/test/results/clientpositive/tez/explainuser_3.q.out
index 2db8b5e..fbf61ef 100644
--- a/ql/src/test/results/clientpositive/tez/explainuser_3.q.out
+++ b/ql/src/test/results/clientpositive/tez/explainuser_3.q.out
@@ -31,13 +31,13 @@ Stage-0
     Stage-1
       Reducer 2 vectorized
       File Output Operator [FS_8]
-        Select Operator [SEL_7] (rows=16 width=107)
+        Select Operator [SEL_7] (rows=16 width=106)
           Output:["_col0","_col1"]
         <-Map 1 [SIMPLE_EDGE] vectorized
           SHUFFLE [RS_6]
-            Select Operator [SEL_5] (rows=16 width=107)
+            Select Operator [SEL_5] (rows=16 width=106)
               Output:["_col0","_col1"]
-              TableScan [TS_0] (rows=16 width=107)
+              TableScan [TS_0] (rows=16 width=106)
                 default@acid_vectorized,acid_vectorized, ACID table,Tbl:COMPLETE,Col:NONE,Output:["a","b"]
 
 PREHOOK: query: explain select key, value
@@ -640,34 +640,101 @@ POSTHOOK: query: explain select a.key, a.value, b.value from tab a join tab_part b on a.key = b.key
 POSTHOOK: type: QUERY
-Plan optimized by CBO.
-
-Vertex dependency in root stage
-Map 2 <- Map 1 (CUSTOM_EDGE)
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
 
-Stage-0
-  Fetch Operator
-    limit:-1
-    Stage-1
-      Map 2
-      File Output Operator [FS_10]
-        Select Operator [SEL_9] (rows=550 width=18)
-          Output:["_col0","_col1","_col2"]
-          Map Join Operator [MAPJOIN_15] (rows=550 width=18)
-            BucketMapJoin:true,Conds:RS_6._col0=SEL_5._col0(Inner),HybridGraceHashJoin:true,Output:["_col0","_col1","_col3"]
-          <-Map 1 [CUSTOM_EDGE]
-            MULTICAST [RS_6]
-              PartitionCols:_col0
-              Select Operator [SEL_2] (rows=242 width=18)
-                Output:["_col0","_col1"]
-                Filter Operator [FIL_13] (rows=242 width=18)
-                  predicate:key is not null
-                  TableScan [TS_0] (rows=242 width=18)
-                    default@tab,a,Tbl:COMPLETE,Col:NONE,Output:["key","value"]
-          <-Select Operator [SEL_5] (rows=500 width=18)
-              Output:["_col0","_col1"]
-              Filter Operator [FIL_14] (rows=500 width=18)
-                predicate:key is not null
-                TableScan [TS_3] (rows=500 width=18)
-                  default@tab_part,b,Tbl:COMPLETE,Col:NONE,Output:["key","value"]
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 3 <- Map 1 (CUSTOM_EDGE), Reducer 2 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: a
+                  filterExpr: key is not null (type: boolean)
+                  Statistics: Num rows: 242 Data size: 4502 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: key is not null (type: boolean)
+                    Statistics: Num rows: 242 Data size: 4502 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: key (type: int), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 242 Data size: 4502 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: int)
+                        Statistics: Num rows: 242 Data size: 4502 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col1 (type: string)
+                      Select Operator
+                        expressions: _col0 (type: int)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 242 Data size: 4502 Basic stats: COMPLETE Column stats: NONE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=242)
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE
+                          Reduce Output Operator
+                            sort order:
+                            Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE
+                            value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+        Map 3
+            Map Operator Tree:
+                TableScan
+                  alias: b
+                  filterExpr: (key is not null and key BETWEEN DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter))) (type: boolean)
+                  Statistics: Num rows: 500 Data size: 9312 Basic stats: COMPLETE Column stats: NONE
+                  Filter Operator
+                    predicate: (key is not null and key BETWEEN DynamicValue(RS_6_a_key_min) AND DynamicValue(RS_6_a_key_max) and in_bloom_filter(key, DynamicValue(RS_6_a_key_bloom_filter))) (type: boolean)
+                    Statistics: Num rows: 500 Data size: 9312 Basic stats: COMPLETE Column stats: NONE
+                    Select Operator
+                      expressions: key (type: int), value (type: string)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 500 Data size: 9312 Basic stats: COMPLETE Column stats: NONE
+                      Map Join Operator
+                        condition map:
+                             Inner Join 0 to 1
+                        keys:
+                          0 _col0 (type: int)
+                          1 _col0 (type: int)
+                        outputColumnNames: _col0, _col1, _col3
+                        input vertices:
+                          0 Map 1
+                        Statistics: Num rows: 550 Data size: 10243 Basic stats: COMPLETE Column stats: NONE
+                        HybridGraceHashJoin: true
+                        Select Operator
+                          expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string)
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 550 Data size: 10243 Basic stats: COMPLETE Column stats: NONE
+                          File Output Operator
+                            compressed: false
+                            Statistics: Num rows: 550 Data size: 10243 Basic stats: COMPLETE Column stats: NONE
+                            table:
+                                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 2
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=242)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  sort order:
+                  Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: NONE
+                  value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/LiteralDelegate.java b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/LiteralDelegate.java
new file mode 100644
index 0000000..bd8a5ce
--- /dev/null
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/LiteralDelegate.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.io.sarg;
+
+import org.apache.hadoop.conf.Configurable;
+
+/**
+ * Interface to retrieve a literal value.
+ */
+public interface LiteralDelegate extends Configurable {
+
+  Object getLiteral();
+
+  String getId();
+}
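LiteralDelegate is the hook that lets a SearchArgument literal be supplied at runtime rather than baked in at plan time. A minimal hypothetical implementation, assuming nothing beyond the interface above plus Hadoop's Configurable contract (the class name and its eager-value design are illustrative only, not part of this patch):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.ql.io.sarg.LiteralDelegate;

    public class FixedLiteralDelegate implements LiteralDelegate {
      private final String id;
      private final Object value;
      private Configuration conf;

      public FixedLiteralDelegate(String id, Object value) {
        this.id = id;
        this.value = value;
      }

      @Override
      public Object getLiteral() {
        // A real delegate would resolve the value at read time, e.g. by looking
        // up a runtime registry keyed by getId(); here the value is fixed.
        return value;
      }

      @Override
      public String getId() {
        return id;
      }

      @Override
      public void setConf(Configuration conf) {
        this.conf = conf;
      }

      @Override
      public Configuration getConf() {
        return conf;
      }
    }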
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java
index 8fda95c..3c10c83 100644
--- a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java
@@ -18,13 +18,20 @@
 
 package org.apache.hadoop.hive.ql.io.sarg;
 
+import org.apache.hadoop.conf.Configuration;
+
 /**
  * A factory for creating SearchArguments, as well as modifying those created by this factory.
  */
 public class SearchArgumentFactory {
   public static SearchArgument.Builder newBuilder() {
-    return new SearchArgumentImpl.BuilderImpl();
+    return newBuilder(null);
+  }
+
+  public static SearchArgument.Builder newBuilder(Configuration conf) {
+    return new SearchArgumentImpl.BuilderImpl(conf);
   }
+
   public static void setPredicateLeafColumn(PredicateLeaf leaf, String newName) {
     SearchArgumentImpl.PredicateLeafImpl.setColumnName(leaf, newName);
   }
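With the new overload, callers can thread a Configuration into the builder so that delegate literals are configured as leaves are created. A hedged usage sketch (the column name and bounds are invented; lower/upper may be plain Long values or LiteralDelegate instances, since type checking is deferred for delegates in the SearchArgumentImpl changes below):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
    import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
    import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

    public class SargBuilderSketch {
      // Builds "key BETWEEN lower AND upper", where the bounds can be resolved lazily.
      public static SearchArgument buildSarg(Configuration conf, Object lower, Object upper) {
        return SearchArgumentFactory.newBuilder(conf)
            .startAnd()
            .between("key", PredicateLeaf.Type.LONG, lower, upper)
            .end()
            .build();
      }
    }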
diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java
index 10d8c51..db0a582 100644
--- a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java
+++ b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java
@@ -31,6 +31,8 @@
 import java.util.Queue;
 import java.util.Set;
 
+import org.apache.hadoop.conf.Configuration;
+
 /**
  * The implementation of SearchArguments. Visible for testing only.
  */
@@ -57,27 +59,17 @@
     public PredicateLeafImpl(Operator operator,
                              Type type,
                              String columnName,
                              Object literal,
-                             List<Object> literalList) {
+                             List<Object> literalList, Configuration conf) {
       this.operator = operator;
       this.type = type;
       this.columnName = columnName;
       this.literal = literal;
-      if (literal != null) {
-        if (literal.getClass() != type.getValueClass()) {
-          throw new IllegalArgumentException("Wrong value class " +
-              literal.getClass().getName() + " for " + type + "." + operator +
-              " leaf");
-        }
-      }
+      checkLiteralType(literal, type, conf);
       this.literalList = literalList;
       if (literalList != null) {
         Class valueCls = type.getValueClass();
         for(Object lit: literalList) {
-          if (lit != null && lit.getClass() != valueCls) {
-            throw new IllegalArgumentException("Wrong value class item " +
-                lit.getClass().getName() + " for " + type + "." + operator +
-                " leaf");
-          }
+          checkLiteralType(lit, type, conf);
         }
       }
     }
@@ -99,6 +91,10 @@ public String getColumnName() {
 
     @Override
     public Object getLiteral() {
+      if (literal instanceof LiteralDelegate) {
+        return ((LiteralDelegate) literal).getLiteral();
+      }
+
       // To get around a kryo 2.22 bug while deserialize a Timestamp into Date
       // (https://github.com/EsotericSoftware/kryo/issues/88)
       // When we see a Date, convert back into Timestamp
@@ -110,6 +106,13 @@ public Object getLiteral() {
 
     @Override
     public List<Object> getLiteralList() {
+      if (literalList != null && literalList.size() > 0 && literalList.get(0) instanceof LiteralDelegate) {
+        List<Object> newLiteralList = new ArrayList<Object>();
+        for (Object literalObj : literalList) {
+          newLiteralList.add(((LiteralDelegate) literalObj).getLiteral());
+        }
+        return newLiteralList;
+      }
       return literalList;
     }
 
@@ -169,6 +172,23 @@ public static void setColumnName(PredicateLeaf leaf, String newName) {
       assert leaf instanceof PredicateLeafImpl;
       ((PredicateLeafImpl)leaf).columnName = newName;
     }
+
+    protected void checkLiteralType(Object literal, Type type, Configuration conf) {
+      if (literal == null) {
+        return;
+      }
+
+      if (literal instanceof LiteralDelegate) {
+        // Give delegates a pass here; optionally, LiteralDelegate could expose a
+        // getLiteralClass() so the type could still be checked.
+        ((LiteralDelegate) literal).setConf(conf);
+      } else {
+        if (literal.getClass() != type.getValueClass()) {
+          throw new IllegalArgumentException("Wrong value class " +
+              literal.getClass().getName() + " for " + type + "." + operator +
+              " leaf");
+        }
+      }
+    }
   }
 
   private final List<PredicateLeaf> leaves;
@@ -218,6 +238,11 @@ public String toString() {
 
   static class BuilderImpl implements Builder {
 
+    Configuration conf;
+    public BuilderImpl(Configuration conf) {
+      this.conf = conf;
+    }
+
     // max threshold for CNF conversion. having >8 elements in andList will be
     // converted to maybe
     private static final int CNF_COMBINATIONS_THRESHOLD = 256;
@@ -291,7 +316,7 @@ public Builder lessThan(String column, PredicateLeaf.Type type,
       } else {
         PredicateLeaf leaf =
             new PredicateLeafImpl(PredicateLeaf.Operator.LESS_THAN,
-                type, column, literal, null);
+                type, column, literal, null, conf);
         parent.getChildren().add(new ExpressionTree(addLeaf(leaf)));
       }
       return this;
@@ -306,7 +331,7 @@ public Builder lessThanEquals(String column, PredicateLeaf.Type type,
       } else {
         PredicateLeaf leaf =
             new PredicateLeafImpl(PredicateLeaf.Operator.LESS_THAN_EQUALS,
-                type, column, literal, null);
+                type, column, literal, null, conf);
         parent.getChildren().add(new ExpressionTree(addLeaf(leaf)));
       }
       return this;
@@ -321,7 +346,7 @@ public Builder equals(String column, PredicateLeaf.Type type,
       } else {
         PredicateLeaf leaf =
             new PredicateLeafImpl(PredicateLeaf.Operator.EQUALS,
-                type, column, literal, null);
+                type, column, literal, null, conf);
         parent.getChildren().add(new ExpressionTree(addLeaf(leaf)));
       }
       return this;
@@ -336,7 +361,7 @@ public Builder nullSafeEquals(String column, PredicateLeaf.Type type,
       } else {
         PredicateLeaf leaf =
             new PredicateLeafImpl(PredicateLeaf.Operator.NULL_SAFE_EQUALS,
-                type, column, literal, null);
+                type, column, literal, null, conf);
         parent.getChildren().add(new ExpressionTree(addLeaf(leaf)));
       }
       return this;
@@ -358,7 +383,7 @@ public Builder in(String column, PredicateLeaf.Type type,
         PredicateLeaf leaf =
             new PredicateLeafImpl(PredicateLeaf.Operator.IN,
-                type, column, null, argList);
+                type, column, null, argList, conf);
         parent.getChildren().add(new ExpressionTree(addLeaf(leaf)));
       }
       return this;
@@ -372,7 +397,7 @@ public Builder isNull(String column, PredicateLeaf.Type type) {
       } else {
         PredicateLeaf leaf =
             new PredicateLeafImpl(PredicateLeaf.Operator.IS_NULL,
-                type, column, null, null);
+                type, column, null, null, conf);
         parent.getChildren().add(new ExpressionTree(addLeaf(leaf)));
       }
       return this;
@@ -390,7 +415,7 @@ public Builder between(String column, PredicateLeaf.Type type, Object lower,
       argList.add(upper);
       PredicateLeaf leaf =
           new PredicateLeafImpl(PredicateLeaf.Operator.BETWEEN,
-              type, column, null, argList);
+              type, column, null, argList, conf);
       parent.getChildren().add(new ExpressionTree(addLeaf(leaf)));
     }
     return this;
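The effect of the delegate-aware getLiteral() can be seen directly through the test-visible PredicateLeafImpl constructor. A small sketch, reusing the hypothetical FixedLiteralDelegate from the earlier example (the id string is illustrative; passing null for conf is acceptable since a delegate merely receives it via setConf):

    import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
    import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;

    public class DelegateResolutionSketch {
      public static void main(String[] args) {
        // The delegate bypasses the eager type check and is resolved on access.
        PredicateLeaf leaf = new SearchArgumentImpl.PredicateLeafImpl(
            PredicateLeaf.Operator.EQUALS, PredicateLeaf.Type.LONG, "key",
            new FixedLiteralDelegate("RS_6_a_key_min", 42L), null, null);
        System.out.println(leaf.getLiteral()); // prints 42, the resolved value
      }
    }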
diff --git a/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java b/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
index e60690d..d44bba8 100644
--- a/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
+++ b/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java
@@ -18,6 +18,8 @@
 
 package org.apache.hive.common.util;
 
+import java.io.*;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
@@ -242,6 +244,55 @@ public void reset() {
   }
 
   /**
+   * Serialize a bloom filter
+   * @param out output stream to write to
+   * @param bloomFilter BloomFilter that needs to be serialized
+   */
+  public static void serialize(OutputStream out, BloomFilter bloomFilter) throws IOException {
+    /**
+     * Serialized BloomFilter format:
+     * 1 byte for the number of hash functions.
+     * 1 big-endian int (that is how DataOutputStream writes it) for the number of bits in the bitset
+     * big-endian longs for the BloomFilter bitset
+     */
+    DataOutputStream dataOutputStream = new DataOutputStream(out);
+    dataOutputStream.writeByte(bloomFilter.numHashFunctions);
+    dataOutputStream.writeInt(bloomFilter.numBits);
+    for (long value : bloomFilter.getBitSet()) {
+      dataOutputStream.writeLong(value);
+    }
+  }
+
+  /**
+   * Deserialize a bloom filter
+   * Read a byte stream, which was written by {@linkplain #serialize(OutputStream, BloomFilter)}
+   * into a {@code BloomFilter}
+   * @param in input bytestream
+   * @return deserialized BloomFilter
+   */
+  public static BloomFilter deserialize(InputStream in) throws IOException {
+    if (in == null) {
+      throw new IOException("Input stream is null");
+    }
+
+    try {
+      DataInputStream dataInputStream = new DataInputStream(in);
+      int numHashFunc = dataInputStream.readByte();
+      int numBits = dataInputStream.readInt();
+      int sz = (numBits / Long.SIZE);
+      List<Long> data = new ArrayList<Long>();
+      for (int i = 0; i < sz; i++) {
+        data.add(dataInputStream.readLong());
+      }
+      return new BloomFilter(data, numBits, numHashFunc);
+    } catch (RuntimeException e) {
+      IOException io = new IOException("Unable to deserialize BloomFilter");
+      io.initCause(e);
+      throw io;
+    }
+  }
+
+  /**
    * Bare metal bit set implementation. For performance reasons, this implementation does not check
    * for index bounds nor expand the bit set size if the specified index is greater than the size.
    */
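A round-trip sketch for the new serialize/deserialize helpers; the expectedEntries value of 242 mirrors the query plans above, and the in-memory streams are only for illustration (in the feature itself the bytes travel through the Tez runtime as a DynamicValue payload):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import org.apache.hive.common.util.BloomFilter;

    public class BloomFilterRoundTrip {
      public static void main(String[] args) throws IOException {
        BloomFilter original = new BloomFilter(242);
        original.addLong(42L);

        // Write the filter out in the format documented in serialize().
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        BloomFilter.serialize(out, original);

        // Read it back and verify membership behavior survives the round trip.
        BloomFilter copy = BloomFilter.deserialize(new ByteArrayInputStream(out.toByteArray()));
        System.out.println(copy.testLong(42L));  // true
        System.out.println(copy.testLong(43L));  // false with high probability
      }
    }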