diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 7ceb322..3fd3cd9 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2844,6 +2844,9 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal TEZ_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE("hive.tez.dynamic.partition.pruning.max.data.size", 100*1024*1024L, "Maximum total data size of events in dynamic pruning."), + TEZ_DYNAMIC_SEMIJOIN_REDUCTION("hive.tez.dynamic.semijoin.reduction", true, + "When dynamic semijoin is enabled, shuffle joins will perform a leaky semijoin before shuffle. This " + + "requires hive.tez.dynamic.partition.pruning to be enabled."), TEZ_SMB_NUMBER_WAVES( "hive.tez.smb.number.waves", (float) 0.5, diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index be5a747..4d4f865 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -154,6 +154,7 @@ minillaplocal.shared.query.files=alter_merge_2_orc.q,\ delete_whole_partition.q,\ disable_merge_for_bucketing.q,\ dynamic_partition_pruning.q,\ + dynamic_semijoin_reduction.q,\ dynpart_sort_opt_vectorization.q,\ dynpart_sort_optimization.q,\ dynpart_sort_optimization2.q,\ @@ -479,6 +480,7 @@ minillaplocal.query.files=acid_globallimit.q,\ correlationoptimizer6.q,\ disable_merge_for_bucketing.q,\ dynamic_partition_pruning.q,\ + dynamic_semijoin_reduction.q,\ dynpart_sort_opt_vectorization.q,\ dynpart_sort_optimization.q,\ dynpart_sort_optimization_acid.q,\ diff --git a/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java b/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java index cdd62ac..30b42ee 100644 --- a/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java +++ b/orc/src/test/org/apache/orc/impl/TestRecordReaderImpl.java @@ -76,7 +76,7 @@ public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator, Object literal, List literalList) { return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName, - literal, literalList); + literal, literalList, null); } // can add .verboseLogging() to cause Mockito to log invocations diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java index 69ba4a2..669e23e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/AbstractMapJoinOperator.java @@ -70,7 +70,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { if (conf.getGenJoinKeys()) { int tagLen = conf.getTagLength(); joinKeys = new List[tagLen]; - JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), NOTSKIPBIGTABLE); + JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), NOTSKIPBIGTABLE, hconf); joinKeysObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinKeys, inputObjInspectors,NOTSKIPBIGTABLE, tagLen); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java index 7e9007c..df1898e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java @@ -254,11 +254,11 @@ protected void initializeOp(Configuration 
hconf) throws HiveException { noOuterJoin = conf.isNoOuterJoin(); totalSz = JoinUtil.populateJoinKeyValue(joinValues, conf.getExprs(), - order,NOTSKIPBIGTABLE); + order,NOTSKIPBIGTABLE, hconf); //process join filters joinFilters = new List[tagLen]; - JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(),order,NOTSKIPBIGTABLE); + JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(),order,NOTSKIPBIGTABLE, hconf); joinValuesObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinValues, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/DynamicValueRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/DynamicValueRegistry.java new file mode 100644 index 0000000..63336bd --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/DynamicValueRegistry.java @@ -0,0 +1,30 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +public interface DynamicValueRegistry { + + // Abstract class to hold info required for the implementation + public static abstract class RegistryConf { + } + + Object getValue(String key) throws Exception; + + void init(RegistryConf conf) throws Exception; +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java index 24c8281..b0384df 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -38,8 +39,8 @@ private transient StructField[] fields; private transient boolean[] unionField; - public ExprNodeColumnEvaluator(ExprNodeColumnDesc expr) { - super(expr); + public ExprNodeColumnEvaluator(ExprNodeColumnDesc expr, Configuration conf) { + super(expr, conf); } @Override diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantDefaultEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantDefaultEvaluator.java index 89a75eb..f53c3e3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantDefaultEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantDefaultEvaluator.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDefaultDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; @@ -33,7 +34,11 @@ 
transient ObjectInspector writableObjectInspector; public ExprNodeConstantDefaultEvaluator(ExprNodeConstantDefaultDesc expr) { - super(expr); + this(expr, null); + } + + public ExprNodeConstantDefaultEvaluator(ExprNodeConstantDefaultDesc expr, Configuration conf) { + super(expr, conf); writableObjectInspector = expr.getWritableObjectInspector(); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantEvaluator.java index 4fe72a0..ca39e21 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeConstantEvaluator.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; @@ -32,7 +33,11 @@ transient ConstantObjectInspector writableObjectInspector; public ExprNodeConstantEvaluator(ExprNodeConstantDesc expr) { - super(expr); + this(expr, null); + } + + public ExprNodeConstantEvaluator(ExprNodeConstantDesc expr, Configuration conf) { + super(expr, conf); writableObjectInspector = expr.getWritableObjectInspector(); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeDynamicValueEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeDynamicValueEvaluator.java new file mode 100644 index 0000000..6c68215 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeDynamicValueEvaluator.java @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.DynamicValue; +import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicValueDesc; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; + +/** + * ExprNodeDynamicEvaluator. 
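+ * Evaluates an ExprNodeDynamicValueDesc whose value is only known at run time: the wrapped
+ * DynamicValue supplies the value when the expression is evaluated (the values themselves are
+ * published through a DynamicValueRegistry).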
+ * + */ +public class ExprNodeDynamicValueEvaluator extends ExprNodeEvaluator { + + transient ObjectInspector oi; + + public ExprNodeDynamicValueEvaluator(ExprNodeDynamicValueDesc expr, Configuration conf) { + super(expr, conf); + oi = ObjectInspectorUtils.getStandardObjectInspector(expr.getWritableObjectInspector(), ObjectInspectorCopyOption.WRITABLE); + } + + @Override + public ObjectInspector initialize(ObjectInspector rowInspector) throws HiveException { + return oi; + } + + @Override + protected Object _evaluate(Object row, int version) throws HiveException { + DynamicValue dynamicValue = expr.getDynamicValue(); + dynamicValue.setConf(conf); + return dynamicValue.getWritableValue(); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluator.java index b8d6ab7..375d65f 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluator.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.exec; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -30,9 +31,11 @@ protected final T expr; protected ObjectInspector outputOI; + protected Configuration conf; - public ExprNodeEvaluator(T expr) { + public ExprNodeEvaluator(T expr, Configuration conf) { this.expr = expr; + this.conf = conf; } /** @@ -109,4 +112,12 @@ public boolean isStateful() { public String toString() { return "ExprNodeEvaluator[" + expr + "]"; } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorFactory.java index 0d03d8f..34aec55 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorFactory.java @@ -21,11 +21,13 @@ import java.util.HashMap; import java.util.Map; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDefaultDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicValueDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; @@ -39,9 +41,13 @@ private ExprNodeEvaluatorFactory() { } public static ExprNodeEvaluator get(ExprNodeDesc desc) throws HiveException { + return get(desc, null); + } + + public static ExprNodeEvaluator get(ExprNodeDesc desc, Configuration conf) throws HiveException { // Constant node if (desc instanceof ExprNodeConstantDesc) { - return new ExprNodeConstantEvaluator((ExprNodeConstantDesc) desc); + return new ExprNodeConstantEvaluator((ExprNodeConstantDesc) desc, conf); } // Special 'default' constant node @@ -51,15 +57,19 @@ public static ExprNodeEvaluator get(ExprNodeDesc desc) throws HiveException { // Column-reference node, e.g. 
a column in the input row if (desc instanceof ExprNodeColumnDesc) { - return new ExprNodeColumnEvaluator((ExprNodeColumnDesc) desc); + return new ExprNodeColumnEvaluator((ExprNodeColumnDesc) desc, conf); } // Generic Function node, e.g. CASE, an operator or a UDF node if (desc instanceof ExprNodeGenericFuncDesc) { - return new ExprNodeGenericFuncEvaluator((ExprNodeGenericFuncDesc) desc); + return new ExprNodeGenericFuncEvaluator((ExprNodeGenericFuncDesc) desc, conf); } // Field node, e.g. get a.myfield1 from a if (desc instanceof ExprNodeFieldDesc) { - return new ExprNodeFieldEvaluator((ExprNodeFieldDesc) desc); + return new ExprNodeFieldEvaluator((ExprNodeFieldDesc) desc, conf); + } + // Dynamic value which will be determined during query runtime + if (desc instanceof ExprNodeDynamicValueDesc) { + return new ExprNodeDynamicValueEvaluator((ExprNodeDynamicValueDesc) desc, conf); } throw new RuntimeException( "Cannot find ExprNodeEvaluator for the exprNodeDesc = " + desc); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorHead.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorHead.java index 42685fb..991bc13 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorHead.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorHead.java @@ -30,7 +30,7 @@ private final ExprNodeEvaluator referencing; public ExprNodeEvaluatorHead(ExprNodeEvaluator referencing) { - super(referencing.getExpr()); + super(referencing.getExpr(), referencing.getConf()); this.referencing = referencing; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorRef.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorRef.java index 0a6b66a..625d486 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorRef.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeEvaluatorRef.java @@ -30,7 +30,7 @@ private final ExprNodeEvaluator referencing; public ExprNodeEvaluatorRef(ExprNodeEvaluator referencing) { - super(referencing.getExpr()); + super(referencing.getExpr(), referencing.getConf()); this.referencing = referencing; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeFieldEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeFieldEvaluator.java index ff32626..1241343 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeFieldEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeFieldEvaluator.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.List; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; @@ -43,9 +44,9 @@ transient ObjectInspector structFieldObjectInspector; transient ObjectInspector resultObjectInspector; - public ExprNodeFieldEvaluator(ExprNodeFieldDesc desc) throws HiveException { - super(desc); - leftEvaluator = ExprNodeEvaluatorFactory.get(desc.getDesc()); + public ExprNodeFieldEvaluator(ExprNodeFieldDesc desc, Configuration conf) throws HiveException { + super(desc, conf); + leftEvaluator = ExprNodeEvaluatorFactory.get(desc.getDesc(), conf); } @Override diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeGenericFuncEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeGenericFuncEvaluator.java index 221abd9..8b9baa6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeGenericFuncEvaluator.java 
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeGenericFuncEvaluator.java @@ -20,6 +20,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; @@ -91,13 +92,13 @@ public Object get() throws HiveException { } } - public ExprNodeGenericFuncEvaluator(ExprNodeGenericFuncDesc expr) throws HiveException { - super(expr); + public ExprNodeGenericFuncEvaluator(ExprNodeGenericFuncDesc expr, Configuration conf) throws HiveException { + super(expr, conf); children = new ExprNodeEvaluator[expr.getChildren().size()]; isEager = false; for (int i = 0; i < children.length; i++) { ExprNodeDesc child = expr.getChildren().get(i); - ExprNodeEvaluator nodeEvaluator = ExprNodeEvaluatorFactory.get(child); + ExprNodeEvaluator nodeEvaluator = ExprNodeEvaluatorFactory.get(child, conf); children[i] = nodeEvaluator; // If we have eager evaluators anywhere below us, then we are eager too. if (nodeEvaluator instanceof ExprNodeGenericFuncEvaluator) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java index bd0d28c..df30ab2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java @@ -60,7 +60,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { try { heartbeatInterval = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVESENDHEARTBEAT); - conditionEvaluator = ExprNodeEvaluatorFactory.get(conf.getPredicate()); + conditionEvaluator = ExprNodeEvaluatorFactory.get(conf.getPredicate(), hconf); if (HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEEXPREVALUATIONCACHE)) { conditionEvaluator = ExprNodeEvaluatorFactory.toCachedEval(conditionEvaluator); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index 6f01da0..e75f5df 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -369,6 +369,7 @@ system.registerGenericUDF("not", GenericUDFOPNot.class); system.registerGenericUDF("!", GenericUDFOPNot.class); system.registerGenericUDF("between", GenericUDFBetween.class); + system.registerGenericUDF("in_bloom_filter", GenericUDFInBloomFilter.class); system.registerGenericUDF("ewah_bitmap_and", GenericUDFEWAHBitmapAnd.class); system.registerGenericUDF("ewah_bitmap_or", GenericUDFEWAHBitmapOr.class); @@ -426,7 +427,7 @@ system.registerGenericUDAF("ewah_bitmap", new GenericUDAFEWAHBitmap()); system.registerGenericUDAF("compute_stats", new GenericUDAFComputeStats()); - + system.registerGenericUDAF("bloom_filter", new GenericUDAFBloomFilter()); system.registerUDAF("percentile", UDAFPercentile.class); @@ -471,7 +472,6 @@ system.registerGenericUDF("to_unix_timestamp", GenericUDFToUnixTimeStamp.class); system.registerGenericUDF("internal_interval", GenericUDFInternalInterval.class); - // Generic UDTF's system.registerGenericUDTF("explode", GenericUDTFExplode.class); system.registerGenericUDTF("replicate_rows", GenericUDTFReplicateRows.class); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java index 46f0ecd..a3ca1cb 100644 --- 
a/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java @@ -212,7 +212,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { keyObjectInspectors = new ObjectInspector[numKeys]; currentKeyObjectInspectors = new ObjectInspector[numKeys]; for (int i = 0; i < numKeys; i++) { - keyFields[i] = ExprNodeEvaluatorFactory.get(conf.getKeys().get(i)); + keyFields[i] = ExprNodeEvaluatorFactory.get(conf.getKeys().get(i), hconf); keyObjectInspectors[i] = keyFields[i].initialize(rowInspector); currentKeyObjectInspectors[i] = ObjectInspectorUtils .getStandardObjectInspector(keyObjectInspectors[i], @@ -258,7 +258,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { new ExprNodeColumnDesc(TypeInfoUtils.getTypeInfoFromObjectInspector( sf.getFieldObjectInspector()), keyField.getFieldName() + "." + sf.getFieldName(), null, - false)); + false), hconf); unionExprEval.initialize(rowInspector); } } @@ -283,7 +283,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { aggregationParameterObjects[i] = new Object[parameters.size()]; for (int j = 0; j < parameters.size(); j++) { aggregationParameterFields[i][j] = ExprNodeEvaluatorFactory - .get(parameters.get(j)); + .get(parameters.get(j), hconf); aggregationParameterObjectInspectors[i][j] = aggregationParameterFields[i][j] .initialize(rowInspector); if (unionExprEval != null) { @@ -352,6 +352,21 @@ protected void initializeOp(Configuration hconf) throws HiveException { } } + // grouping id should be pruned, which is the last of key columns + // see ColumnPrunerGroupByProc + outputKeyLength = conf.pruneGroupingSetId() ? keyFields.length - 1 : keyFields.length; + + // init objectInspectors + ObjectInspector[] objectInspectors = + new ObjectInspector[outputKeyLength + aggregationEvaluators.length]; + for (int i = 0; i < outputKeyLength; i++) { + objectInspectors[i] = currentKeyObjectInspectors[i]; + } + for (int i = 0; i < aggregationEvaluators.length; i++) { + objectInspectors[outputKeyLength + i] = aggregationEvaluators[i].init(conf.getAggregators() + .get(i).getMode(), aggregationParameterObjectInspectors[i]); + } + aggregationsParametersLastInvoke = new Object[conf.getAggregators().size()][]; if ((conf.getMode() != GroupByDesc.Mode.HASH || conf.getBucketGroup()) && (!groupingSetsPresent)) { @@ -374,21 +389,6 @@ protected void initializeOp(Configuration hconf) throws HiveException { List fieldNames = new ArrayList(conf.getOutputColumnNames()); - // grouping id should be pruned, which is the last of key columns - // see ColumnPrunerGroupByProc - outputKeyLength = conf.pruneGroupingSetId() ? 
keyFields.length - 1 : keyFields.length; - - // init objectInspectors - ObjectInspector[] objectInspectors = - new ObjectInspector[outputKeyLength + aggregationEvaluators.length]; - for (int i = 0; i < outputKeyLength; i++) { - objectInspectors[i] = currentKeyObjectInspectors[i]; - } - for (int i = 0; i < aggregationEvaluators.length; i++) { - objectInspectors[outputKeyLength + i] = aggregationEvaluators[i].init(conf.getAggregators() - .get(i).getMode(), aggregationParameterObjectInspectors[i]); - } - outputObjInspector = ObjectInspectorFactory .getStandardStructObjectInspector(fieldNames, Arrays.asList(objectInspectors)); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java index ac5331e..3a366f6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/HashTableSinkOperator.java @@ -143,19 +143,19 @@ protected void initializeOp(Configuration hconf) throws HiveException { // process join keys joinKeys = new List[tagLen]; - JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), posBigTableAlias); + JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), posBigTableAlias, hconf); joinKeysObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinKeys, inputObjInspectors, posBigTableAlias, tagLen); // process join values joinValues = new List[tagLen]; - JoinUtil.populateJoinKeyValue(joinValues, conf.getExprs(), posBigTableAlias); + JoinUtil.populateJoinKeyValue(joinValues, conf.getExprs(), posBigTableAlias, hconf); joinValuesObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinValues, inputObjInspectors, posBigTableAlias, tagLen); // process join filters joinFilters = new List[tagLen]; - JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(), posBigTableAlias); + JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(), posBigTableAlias, hconf); joinFilterObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinFilters, inputObjInspectors, posBigTableAlias, tagLen); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinUtil.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinUtil.java index 9718c48..07a3dc6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinUtil.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/JoinUtil.java @@ -121,14 +121,14 @@ } public static int populateJoinKeyValue(List[] outMap, - Map> inputMap, int posBigTableAlias) throws HiveException { - return populateJoinKeyValue(outMap, inputMap, null, posBigTableAlias); + Map> inputMap, int posBigTableAlias, Configuration conf) throws HiveException { + return populateJoinKeyValue(outMap, inputMap, null, posBigTableAlias, conf); } public static int populateJoinKeyValue(List[] outMap, Map> inputMap, Byte[] order, - int posBigTableAlias) throws HiveException { + int posBigTableAlias, Configuration conf) throws HiveException { int total = 0; for (Entry> e : inputMap.entrySet()) { if (e.getValue() == null) { @@ -140,7 +140,7 @@ public static int populateJoinKeyValue(List[] outMap, if (key == (byte) posBigTableAlias) { valueFields.add(null); } else { - valueFields.add(ExprNodeEvaluatorFactory.get(expr)); + valueFields.add(ExprNodeEvaluatorFactory.get(expr, conf)); } } outMap[key] = valueFields; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCache.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCache.java index 440e0a1..b931c95 100644 --- 
a/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCache.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCache.java @@ -44,6 +44,16 @@ public T retrieve(String key, Callable fn) throws HiveException; /** + * Retrieve object from cache. + * + * @param + * @param key + * the key of the cached object to look up + * @return the last cached object with the key, null if none. + */ + public T retrieve(String key) throws HiveException; + + /** * Retrieve object from cache asynchronously. * * @param diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCacheWrapper.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCacheWrapper.java index 9768efa..71bcd98 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCacheWrapper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ObjectCacheWrapper.java @@ -36,6 +36,11 @@ public void release(String key) { } @Override + public T retrieve(String key) throws HiveException { + return globalCache.retrieve(makeKey(key)); + } + + @Override public T retrieve(String key, Callable fn) throws HiveException { return globalCache.retrieve(makeKey(key), fn); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java index 9049ddd..a30c771 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java @@ -63,7 +63,7 @@ protected void initializeOp(Configuration hconf) throws HiveException { eval = new ExprNodeEvaluator[colList.size()]; for (int i = 0; i < colList.size(); i++) { assert (colList.get(i) != null); - eval[i] = ExprNodeEvaluatorFactory.get(colList.get(i)); + eval[i] = ExprNodeEvaluatorFactory.get(colList.get(i), hconf); } if (HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVEEXPREVALUATIONCACHE)) { eval = ExprNodeEvaluatorFactory.toCachedEvals(eval); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ObjectCache.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ObjectCache.java index 008f8a4..cfe1750 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ObjectCache.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/mr/ObjectCache.java @@ -47,6 +47,11 @@ public void release(String key) { } @Override + public T retrieve(String key) throws HiveException { + return retrieve(key, null); + } + + @Override public T retrieve(String key, Callable fn) throws HiveException { try { if (isDebugEnabled) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DynamicValueRegistryTez.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DynamicValueRegistryTez.java new file mode 100644 index 0000000..7bbedf6 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DynamicValueRegistryTez.java @@ -0,0 +1,131 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.tez; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory; +import org.apache.hadoop.hive.ql.exec.DynamicValueRegistry; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo; +import org.apache.hadoop.hive.ql.plan.BaseWork; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.util.ReflectionUtils; +import org.apache.tez.runtime.api.Input; +import org.apache.tez.runtime.api.LogicalInput; +import org.apache.tez.runtime.api.ProcessorContext; +import org.apache.tez.runtime.library.api.KeyValueReader; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DynamicValueRegistryTez implements DynamicValueRegistry { + private static final Logger LOG = LoggerFactory.getLogger(DynamicValueRegistryTez.class); + + public static class RegistryConfTez extends RegistryConf { + public Configuration conf; + public BaseWork baseWork; + public ProcessorContext processorContext; + public Map inputs; + + public RegistryConfTez(Configuration conf, BaseWork baseWork, + ProcessorContext processorContext, Map inputs) { + super(); + this.conf = conf; + this.baseWork = baseWork; + this.processorContext = processorContext; + this.inputs = inputs; + } + } + + protected Map values = Collections.synchronizedMap(new HashMap()); + + public DynamicValueRegistryTez() { + } + + @Override + public Object getValue(String key) { + if (!values.containsKey(key)) { + throw new IllegalStateException("Value does not exist in registry: " + key); + } + return values.get(key); + } + + protected void setValue(String key, Object value) { + values.put(key, value); + } + + @Override + public void init(RegistryConf conf) throws Exception { + RegistryConfTez rct = (RegistryConfTez) conf; + + for (String inputSourceName : rct.baseWork.getInputSourceToRuntimeValuesInfo().keySet()) { + LOG.info("Runtime value source: " + inputSourceName); + + LogicalInput runtimeValueInput = rct.inputs.get(inputSourceName); + RuntimeValuesInfo runtimeValuesInfo = rct.baseWork.getInputSourceToRuntimeValuesInfo().get(inputSourceName); + + // Setup deserializer/obj inspectors for the incoming data source + Deserializer deserializer = ReflectionUtils.newInstance(runtimeValuesInfo.getTableDesc().getDeserializerClass(), null); + deserializer.initialize(rct.conf, runtimeValuesInfo.getTableDesc().getProperties()); + ObjectInspector inspector = deserializer.getObjectInspector(); + + // Set up col expressions for the dynamic values using this input + List colExprEvaluators = new ArrayList(); + for (ExprNodeDesc expr : runtimeValuesInfo.getColExprs()) { + ExprNodeEvaluator exprEval = ExprNodeEvaluatorFactory.get(expr, null); + exprEval.initialize(inspector); + colExprEvaluators.add(exprEval); + } + + runtimeValueInput.start(); + List inputList = new ArrayList(); + 
inputList.add(runtimeValueInput); + rct.processorContext.waitForAllInputsReady(inputList); + + KeyValueReader kvReader = (KeyValueReader) runtimeValueInput.getReader(); + long rowCount = 0; + while (kvReader.next()) { + Object row = deserializer.deserialize((Writable) kvReader.getCurrentValue()); + rowCount++; + for (int colIdx = 0; colIdx < colExprEvaluators.size(); ++colIdx) { + // Read each expression and save it to the value registry + ExprNodeEvaluator eval = colExprEvaluators.get(colIdx); + Object val = eval.evaluate(row); + setValue(runtimeValuesInfo.getDynamicValueIDs().get(colIdx), val); + } + } + // For now, expecting a single row (min/max, aggregated bloom filter) + if (rowCount != 1) { + throw new IllegalStateException("Expected 1 row from " + inputSourceName + ", got " + rowCount); + } + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/LlapObjectCache.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/LlapObjectCache.java index 0141230..1ce8ee9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/LlapObjectCache.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/LlapObjectCache.java @@ -60,6 +60,24 @@ public void release(String key) { @SuppressWarnings("unchecked") @Override + public T retrieve(String key) throws HiveException { + + T value = null; + + lock.lock(); + try { + value = (T) registry.getIfPresent(key); + if (value != null && isLogDebugEnabled) { + LOG.debug("Found " + key + " in cache"); + } + return value; + } finally { + lock.unlock(); + } + } + + @SuppressWarnings("unchecked") + @Override public T retrieve(String key, Callable fn) throws HiveException { T value = null; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java index 955fa80..790c9d8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java @@ -51,11 +51,13 @@ import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; +import org.apache.hadoop.hive.ql.exec.tez.DynamicValueRegistryTez.RegistryConfTez; import org.apache.hadoop.hive.ql.exec.tez.TezProcessor.TezKVOutputCollector; import org.apache.hadoop.hive.ql.exec.tez.tools.KeyValueInputMerger; import org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator; import org.apache.hadoop.hive.ql.log.PerfLogger; import org.apache.hadoop.hive.ql.plan.BaseWork; +import org.apache.hadoop.hive.ql.plan.DynamicValue; import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.serde2.Deserializer; @@ -88,8 +90,8 @@ private final ExecMapperContext execContext; private MapWork mapWork; List mergeWorkList; - List cacheKeys; - ObjectCache cache; + List cacheKeys, dynamicValueCacheKeys; + ObjectCache cache, dynamicValueCache; private int nRows; public MapRecordProcessor(final JobConf jconf, final ProcessorContext context) throws Exception { @@ -99,9 +101,11 @@ public MapRecordProcessor(final JobConf jconf, final ProcessorContext context) t setLlapOfFragmentId(context); } cache = ObjectCacheFactory.getCache(jconf, queryId, true); + dynamicValueCache = ObjectCacheFactory.getCache(jconf, queryId, false); execContext = new ExecMapperContext(jconf); execContext.setJc(jconf); cacheKeys = new ArrayList(); + dynamicValueCacheKeys = new ArrayList(); nRows = 0; 
} @@ -295,6 +299,21 @@ public Object call() { mapOp.initializeLocalWork(jconf); + // Setup values registry + checkAbortCondition(); + String valueRegistryKey = DynamicValue.DYNAMIC_VALUE_REGISTRY_CACHE_KEY; + // On LLAP dynamic value registry might already be cached. + final DynamicValueRegistryTez registryTez = dynamicValueCache.retrieve(valueRegistryKey, + new Callable() { + @Override + public DynamicValueRegistryTez call() { + return new DynamicValueRegistryTez(); + } + }); + dynamicValueCacheKeys.add(valueRegistryKey); + RegistryConfTez registryConf = new RegistryConfTez(jconf, mapWork, processorContext, inputs); + registryTez.init(registryConf); + checkAbortCondition(); initializeMapRecordSources(); mapOp.initializeMapOperator(jconf); @@ -435,6 +454,12 @@ void close(){ } } + if (dynamicValueCache != null && dynamicValueCacheKeys != null) { + for (String k: dynamicValueCacheKeys) { + dynamicValueCache.release(k); + } + } + // detecting failed executions by exceptions thrown by the operator tree try { if (mapOp == null || mapWork == null) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ObjectCache.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ObjectCache.java index 06dca00..72dcdd3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ObjectCache.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ObjectCache.java @@ -65,6 +65,22 @@ public void release(String key) { LOG.info("Releasing key: " + key); } + + @SuppressWarnings("unchecked") + @Override + public T retrieve(String key) throws HiveException { + T value = null; + try { + value = (T) registry.get(key); + if ( value != null) { + LOG.info("Found " + key + " in cache with value: " + value); + } + } catch (Exception e) { + throw new HiveException(e); + } + return value; + } + @SuppressWarnings("unchecked") @Override public T retrieve(String key, Callable fn) throws HiveException { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java index d80f201..2d06545 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/ReduceRecordProcessor.java @@ -40,9 +40,11 @@ import org.apache.hadoop.hive.ql.exec.OperatorUtils; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats; +import org.apache.hadoop.hive.ql.exec.tez.DynamicValueRegistryTez.RegistryConfTez; import org.apache.hadoop.hive.ql.exec.tez.TezProcessor.TezKVOutputCollector; import org.apache.hadoop.hive.ql.log.PerfLogger; import org.apache.hadoop.hive.ql.plan.BaseWork; +import org.apache.hadoop.hive.ql.plan.DynamicValue; import org.apache.hadoop.hive.ql.plan.ReduceWork; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -64,14 +66,14 @@ private static final String REDUCE_PLAN_KEY = "__REDUCE_PLAN__"; - private ObjectCache cache; + private ObjectCache cache, dynamicValueCache; public static final Logger l4j = LoggerFactory.getLogger(ReduceRecordProcessor.class); private ReduceWork reduceWork; List mergeWorkList = null; - List cacheKeys; + List cacheKeys, dynamicValueCacheKeys; private final Map connectOps = new TreeMap(); @@ -91,9 +93,11 @@ public ReduceRecordProcessor(final JobConf jconf, final ProcessorContext context String queryId = HiveConf.getVar(jconf, HiveConf.ConfVars.HIVEQUERYID); cache = ObjectCacheFactory.getCache(jconf, 
queryId, true); + dynamicValueCache = ObjectCacheFactory.getCache(jconf, queryId, false); String cacheKey = processorContext.getTaskVertexName() + REDUCE_PLAN_KEY; cacheKeys = Lists.newArrayList(cacheKey); + dynamicValueCacheKeys = new ArrayList(); reduceWork = (ReduceWork) cache.retrieve(cacheKey, new Callable() { @Override public Object call() { @@ -169,6 +173,21 @@ void init( l4j.info("Memory available for operators set to {}", LlapUtil.humanReadableByteCount(memoryAvailableToTask)); } OperatorUtils.setMemoryAvailable(reducer.getChildOperators(), memoryAvailableToTask); + + // Setup values registry + String valueRegistryKey = DynamicValue.DYNAMIC_VALUE_REGISTRY_CACHE_KEY; + DynamicValueRegistryTez registryTez = dynamicValueCache.retrieve(valueRegistryKey, + new Callable() { + @Override + public DynamicValueRegistryTez call() { + return new DynamicValueRegistryTez(); + } + }); + dynamicValueCacheKeys.add(valueRegistryKey); + RegistryConfTez registryConf = new RegistryConfTez(jconf, reduceWork, processorContext, inputs); + registryTez.init(registryConf); + checkAbortCondition(); + if (numTags > 1) { sources = new ReduceRecordSource[numTags]; mainWorkOIs = new ObjectInspector[numTags]; @@ -348,6 +367,12 @@ void close(){ } } + if (dynamicValueCache != null && dynamicValueCacheKeys != null) { + for (String k: dynamicValueCacheKeys) { + dynamicValueCache.release(k); + } + } + try { for (ReduceRecordSource rs: sources) { abort = abort && rs.close(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java index 0cb6c8a..848fc8e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapJoinOperator.java @@ -153,7 +153,7 @@ public void assign(VectorExpressionWriter[] writers, List oids) VectorExpression vectorExpr = bigTableValueExpressions[i]; // This is a vectorized aware evaluator - ExprNodeEvaluator eval = new ExprNodeEvaluator(desc) { + ExprNodeEvaluator eval = new ExprNodeEvaluator(desc, hconf) { int columnIndex; int writerIndex; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java index 80b0a14..ac3363e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorSMBMapJoinOperator.java @@ -207,7 +207,7 @@ public void assign(VectorExpressionWriter[] writers, List oids) VectorExpression vectorExpr = bigTableValueExpressions[i]; // This is a vectorized aware evaluator - ExprNodeEvaluator eval = new ExprNodeEvaluator(desc) { + ExprNodeEvaluator eval = new ExprNodeEvaluator(desc, hconf) { int columnIndex;; int writerIndex; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java b/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java index 9d900e4..997334b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/sarg/ConvertAstToSearchArg.java @@ -27,9 +27,11 @@ import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.SerializationUtilities; +import org.apache.hadoop.hive.ql.io.sarg.LiteralDelegate; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import 
org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicValueDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; @@ -58,14 +60,16 @@ public class ConvertAstToSearchArg { private static final Logger LOG = LoggerFactory.getLogger(ConvertAstToSearchArg.class); - private final SearchArgument.Builder builder = - SearchArgumentFactory.newBuilder(); + private final SearchArgument.Builder builder; + private final Configuration conf; /** * Builds the expression and leaf list from the original predicate. * @param expression the expression to translate. */ - ConvertAstToSearchArg(ExprNodeGenericFuncDesc expression) { + ConvertAstToSearchArg(Configuration conf, ExprNodeGenericFuncDesc expression) { + this.conf = conf; + builder = SearchArgumentFactory.newBuilder(conf); parse(expression); } @@ -182,7 +186,7 @@ private static Object boxLiteral(ExprNodeConstantDesc constantDesc, * @param type the type of the expression * @return the literal boxed if found or null */ - private static Object findLiteral(ExprNodeGenericFuncDesc expr, + private static Object findLiteral(Configuration conf, ExprNodeGenericFuncDesc expr, PredicateLeaf.Type type) { List children = expr.getChildren(); if (children.size() != 2) { @@ -190,16 +194,29 @@ private static Object findLiteral(ExprNodeGenericFuncDesc expr, } Object result = null; for(ExprNodeDesc child: children) { - if (child instanceof ExprNodeConstantDesc) { + Object currentResult = getLiteral(conf, child, type); + if (currentResult != null) { + // Both children in the expression should not be literal if (result != null) { return null; } - result = boxLiteral((ExprNodeConstantDesc) child, type); + result = currentResult; } } return result; } + private static Object getLiteral(Configuration conf, ExprNodeDesc child, PredicateLeaf.Type type) { + if (child instanceof ExprNodeConstantDesc) { + return boxLiteral((ExprNodeConstantDesc) child, type); + } else if (child instanceof ExprNodeDynamicValueDesc) { + LiteralDelegate value = ((ExprNodeDynamicValueDesc) child).getDynamicValue(); + value.setConf(conf); + return value; + } + return null; + } + /** * Return the boxed literal at the given position * @param expr the parent node @@ -207,15 +224,12 @@ private static Object findLiteral(ExprNodeGenericFuncDesc expr, * @param position the child position to check * @return the boxed literal if found otherwise null */ - private static Object getLiteral(ExprNodeGenericFuncDesc expr, + private static Object getLiteral(Configuration conf, ExprNodeGenericFuncDesc expr, PredicateLeaf.Type type, int position) { List children = expr.getChildren(); - Object child = children.get(position); - if (child instanceof ExprNodeConstantDesc) { - return boxLiteral((ExprNodeConstantDesc) child, type); - } - return null; + ExprNodeDesc child = children.get(position); + return getLiteral(conf, child, type); } private static Object[] getLiteralList(ExprNodeGenericFuncDesc expr, @@ -272,16 +286,16 @@ private void createLeaf(PredicateLeaf.Operator operator, builder.isNull(columnName, type); break; case EQUALS: - builder.equals(columnName, type, findLiteral(expression, type)); + builder.equals(columnName, type, findLiteral(conf, expression, type)); break; case NULL_SAFE_EQUALS: - builder.nullSafeEquals(columnName, type, findLiteral(expression, type)); + 
builder.nullSafeEquals(columnName, type, findLiteral(conf, expression, type)); break; case LESS_THAN: - builder.lessThan(columnName, type, findLiteral(expression, type)); + builder.lessThan(columnName, type, findLiteral(conf, expression, type)); break; case LESS_THAN_EQUALS: - builder.lessThanEquals(columnName, type, findLiteral(expression, type)); + builder.lessThanEquals(columnName, type, findLiteral(conf, expression, type)); break; case IN: builder.in(columnName, type, @@ -289,8 +303,8 @@ private void createLeaf(PredicateLeaf.Operator operator, break; case BETWEEN: builder.between(columnName, type, - getLiteral(expression, type, variable + 1), - getLiteral(expression, type, variable + 2)); + getLiteral(conf, expression, type, variable + 1), + getLiteral(conf, expression, type, variable + 2)); break; } } catch (Exception e) { @@ -425,8 +439,8 @@ private void parse(ExprNodeDesc expression) { public static final String SARG_PUSHDOWN = "sarg.pushdown"; - public static SearchArgument create(ExprNodeGenericFuncDesc expression) { - return new ConvertAstToSearchArg(expression).buildSearchArgument(); + public static SearchArgument create(Configuration conf, ExprNodeGenericFuncDesc expression) { + return new ConvertAstToSearchArg(conf, expression).buildSearchArgument(); } @@ -445,7 +459,7 @@ public static SearchArgument create(byte[] kryoBytes) { public static SearchArgument createFromConf(Configuration conf) { String sargString; if ((sargString = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR)) != null) { - return create(SerializationUtilities.deserializeExpression(sargString)); + return create(conf, SerializationUtilities.deserializeExpression(sargString)); } else if ((sargString = conf.get(SARG_PUSHDOWN)) != null) { return create(sargString); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java index 26fcc45..3ab8a1c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/DynamicPartitionPruningOptimization.java @@ -28,13 +28,8 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; -import org.apache.hadoop.hive.ql.exec.FilterOperator; -import org.apache.hadoop.hive.ql.exec.GroupByOperator; -import org.apache.hadoop.hive.ql.exec.Operator; -import org.apache.hadoop.hive.ql.exec.OperatorFactory; -import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; -import org.apache.hadoop.hive.ql.exec.SelectOperator; -import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.*; +import org.apache.hadoop.hive.ql.io.AcidUtils.Operation; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; @@ -50,20 +45,19 @@ import org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext; -import org.apache.hadoop.hive.ql.plan.AggregationDesc; -import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; -import 
org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; -import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicListDesc; -import org.apache.hadoop.hive.ql.plan.FilterDesc; -import org.apache.hadoop.hive.ql.plan.GroupByDesc; -import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.ql.plan.PlanUtils; -import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.plan.*; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBloomFilter.GenericUDAFBloomFilterEvaluator; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationRequest; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -148,15 +142,13 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Obje FilterOperator filter = (FilterOperator) nd; FilterDesc desc = filter.getConf(); - TableScanOperator ts = null; - if (!parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING) && !parseContext.getConf().getBoolVar(ConfVars.SPARK_DYNAMIC_PARTITION_PRUNING)) { // nothing to do when the optimization is off return null; } - DynamicPartitionPrunerContext removerContext = new DynamicPartitionPrunerContext(); + TableScanOperator ts = null; if (filter.getParentOperators().size() == 1 && filter.getParentOperators().get(0) instanceof TableScanOperator) { @@ -169,14 +161,32 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Obje LOG.debug("TableScan: " + ts); } + DynamicPartitionPrunerContext removerContext = new DynamicPartitionPrunerContext(); + // collect the dynamic pruning conditions removerContext.dynLists.clear(); collectDynamicPruningConditions(desc.getPredicate(), removerContext); + if (ts == null) { + // Replace the synthetic predicate with true and bail out + for (DynamicListContext ctx : removerContext) { + ExprNodeDesc constNode = + new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true); + replaceExprNode(ctx, desc, constNode); + } + return false; + } + + final boolean semiJoin = parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION); + for (DynamicListContext ctx : removerContext) { String column = ExprNodeDescUtils.extractColName(ctx.parent); + boolean semiJoinAttempted = false; + + if (column != null) { + // Need unique IDs to refer to each min/max key value in the DynamicValueRegistry + String keyBaseAlias = ""; - if (ts != null && column != null) { Table table = ts.getConf().getTableMetadata(); if (table != null && table.isPartitionKey(column)) { @@ -203,20 +213,56 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Obje } } else { LOG.debug("Column " + column + " is not a partition column"); + if (semiJoin && ts.getConf().getFilterExpr() != null) { + LOG.debug("Initiate semijoin reduction for " + column); + // Get the table name from which the min-max values will come. 
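+ // Walk up from the generating reduce sink to its TableScan; the scan's alias, the generator's
+ // operator id and the column name together form a unique base for the min/max/bloom_filter keys.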
+ Operator op = ctx.generator; + while (!(op == null || op instanceof TableScanOperator)) { + op = op.getParentOperators().get(0); + } + String tableAlias = (op == null ? "" : ((TableScanOperator) op).getConf().getAlias()); + keyBaseAlias = ctx.generator.getOperatorId() + "_" + tableAlias + "_" + column; + + semiJoinAttempted = generateSemiJoinOperatorPlan(ctx, parseContext, ts, keyBaseAlias); + } } - } - // we always remove the condition by replacing it with "true" - ExprNodeDesc constNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true); - if (ctx.grandParent == null) { - desc.setPredicate(constNode); + // If semijoin is attempted then replace the condition with a min-max filter + // and bloom filter else, + // we always remove the condition by replacing it with "true" + if (semiJoinAttempted) { + List betweenArgs = new ArrayList(); + betweenArgs.add(new ExprNodeConstantDesc(Boolean.FALSE)); // Do not invert between result + // add column expression here + betweenArgs.add(ctx.parent.getChildren().get(0)); + betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_min", ctx.desc.getTypeInfo()))); + betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_max", ctx.desc.getTypeInfo()))); + ExprNodeDesc betweenNode = ExprNodeGenericFuncDesc.newInstance( + FunctionRegistry.getFunctionInfo("between").getGenericUDF(), betweenArgs); + replaceExprNode(ctx, desc, betweenNode); + // add column expression for bloom filter + List bloomFilterArgs = new ArrayList(); + bloomFilterArgs.add(ctx.parent.getChildren().get(0)); + bloomFilterArgs.add(new ExprNodeDynamicValueDesc( + new DynamicValue(keyBaseAlias + "_bloom_filter", + TypeInfoFactory.binaryTypeInfo))); + ExprNodeDesc bloomFilterNode = ExprNodeGenericFuncDesc.newInstance( + FunctionRegistry.getFunctionInfo("in_bloom_filter"). + getGenericUDF(), bloomFilterArgs); + // ctx may not have the grandparent but it is set in filterDesc by now. + ExprNodeDesc grandParent = ctx.grandParent == null ? + desc.getPredicate() : ctx.grandParent; + grandParent.getChildren().add(bloomFilterNode); + } else { + ExprNodeDesc replaceNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true); + replaceExprNode(ctx, desc, replaceNode); + } } else { - int i = ctx.grandParent.getChildren().indexOf(ctx.parent); - ctx.grandParent.getChildren().remove(i); - ctx.grandParent.getChildren().add(i, constNode); + ExprNodeDesc constNode = + new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true); + replaceExprNode(ctx, desc, constNode); } } - // if we pushed the predicate into the table scan we need to remove the // synthetic conditions there. cleanTableScanFilters(ts); @@ -224,6 +270,16 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, Obje return false; } + private void replaceExprNode(DynamicListContext ctx, FilterDesc desc, ExprNodeDesc node) { + if (ctx.grandParent == null) { + desc.setPredicate(node); + } else { + int i = ctx.grandParent.getChildren().indexOf(ctx.parent); + ctx.grandParent.getChildren().remove(i); + ctx.grandParent.getChildren().add(i, node); + } + } + private void cleanTableScanFilters(TableScanOperator ts) throws SemanticException { if (ts == null || ts.getConf() == null || ts.getConf().getFilterExpr() == null) { @@ -327,6 +383,226 @@ private void generateEventOperatorPlan(DynamicListContext ctx, ParseContext pars } } + // Generates plan for min/max when dynamic partition pruning is ruled out. 
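+ // The generated branch is: parent of the reduce sink -> SELECT(key) -> GROUP BY computing
+ // min/max/bloom_filter partials -> REDUCE SINK -> final GROUP BY. Its single output row feeds the
+ // _min, _max and _bloom_filter dynamic values referenced by the rewritten filter predicate.
+ // Returns false if the branch is not created (for example when the key is a partition column).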
+ private boolean generateSemiJoinOperatorPlan(DynamicListContext ctx, ParseContext parseContext, + TableScanOperator ts, String keyBaseAlias) throws SemanticException { + + // we will put a fork in the plan at the source of the reduce sink + Operator parentOfRS = ctx.generator.getParentOperators().get(0); + + // we need the expr that generated the key of the reduce sink + ExprNodeDesc key = ctx.generator.getConf().getKeyCols().get(ctx.desc.getKeyIndex()); + + if (parentOfRS instanceof SelectOperator) { + // Make sure the semijoin branch is not on a partition column. + String internalColName = null; + ExprNodeDesc exprNodeDesc = key; + // Find the ExprNodeColumnDesc + while (!(exprNodeDesc instanceof ExprNodeColumnDesc)) { + exprNodeDesc = exprNodeDesc.getChildren().get(0); + } + internalColName = ((ExprNodeColumnDesc) exprNodeDesc).getColumn(); + + ExprNodeColumnDesc colExpr = ((ExprNodeColumnDesc)(parentOfRS. + getColumnExprMap().get(internalColName))); + String colName = ExprNodeDescUtils.extractColName(colExpr); + + // Fetch the TableScan Operator. + Operator op = parentOfRS.getParentOperators().get(0); + while (op != null && !(op instanceof TableScanOperator)) { + op = op.getParentOperators().get(0); + } + assert op != null; + + Table table = ((TableScanOperator) op).getConf().getTableMetadata(); + if (table.isPartitionKey(colName)) { + // The column is a partition column, skip the optimization. + return false; + } + } + List keyExprs = new ArrayList(); + keyExprs.add(key); + + // group by requires "ArrayList", don't ask. + ArrayList outputNames = new ArrayList(); + outputNames.add(HiveConf.getColumnInternalName(0)); + + // project the relevant key column + SelectDesc select = new SelectDesc(keyExprs, outputNames); + SelectOperator selectOp = + (SelectOperator) OperatorFactory.getAndMakeChild(select, + new RowSchema(parentOfRS.getSchema()), parentOfRS); + + // do a group by to aggregate min, max and bloom filter.
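+    // The first (PARTIAL1, hash) group by computes min(key), max(key) and bloom_filter(key); the
+    // bloom filter evaluator is pointed at the SelectOperator so it can size itself from the
+    // source's estimated row count.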
+ float groupByMemoryUsage = + HiveConf.getFloatVar(parseContext.getConf(), HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY); + float memoryThreshold = + HiveConf.getFloatVar(parseContext.getConf(), + HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD); + + ArrayList groupByExprs = new ArrayList(); + + // Add min/max and bloom filter aggregations + List aggFnOIs = new ArrayList(); + aggFnOIs.add(key.getWritableObjectInspector()); + ArrayList params = new ArrayList(); + params.add( + new ExprNodeColumnDesc(key.getTypeInfo(), outputNames.get(0), + "", false)); + + ArrayList aggs = new ArrayList(); + try { + AggregationDesc min = new AggregationDesc("min", + FunctionRegistry.getGenericUDAFEvaluator("min", aggFnOIs, false, false), + params, false, Mode.PARTIAL1); + AggregationDesc max = new AggregationDesc("max", + FunctionRegistry.getGenericUDAFEvaluator("max", aggFnOIs, false, false), + params, false, Mode.PARTIAL1); + AggregationDesc bloomFilter = new AggregationDesc("bloom_filter", + FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", aggFnOIs, false, false), + params, false, Mode.PARTIAL1); + GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator(); + bloomFilterEval.setSourceOperator(selectOp); + bloomFilter.setGenericUDAFWritableEvaluator(bloomFilterEval); + aggs.add(min); + aggs.add(max); + aggs.add(bloomFilter); + } catch (SemanticException e) { + LOG.error("Error creating min/max aggregations on key", e); + throw new IllegalStateException("Error creating min/max aggregations on key", e); + } + + // Create the Group by Operator + ArrayList gbOutputNames = new ArrayList(); + gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(0)); + gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(1)); + gbOutputNames.add(SemanticAnalyzer.getColumnInternalName(2)); + GroupByDesc groupBy = new GroupByDesc(GroupByDesc.Mode.HASH, + gbOutputNames, new ArrayList(), aggs, false, + groupByMemoryUsage, memoryThreshold, null, false, 0, false); + + ArrayList groupbyColInfos = new ArrayList(); + groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(0), key.getTypeInfo(), "", false)); + groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(1), key.getTypeInfo(), "", false)); + groupbyColInfos.add(new ColumnInfo(gbOutputNames.get(2), key.getTypeInfo(), "", false)); + + GroupByOperator groupByOp = (GroupByOperator)OperatorFactory.getAndMakeChild( + groupBy, new RowSchema(groupbyColInfos), selectOp); + + groupByOp.setColumnExprMap(new HashMap()); + + // Get the column names of the aggregations for reduce sink + int colPos = 0; + ArrayList rsValueCols = new ArrayList(); + for (int i = 0; i < aggs.size() - 1; i++) { + ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(key.getTypeInfo(), + gbOutputNames.get(colPos++), "", false); + rsValueCols.add(colExpr); + } + + // Bloom Filter uses binary + ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(TypeInfoFactory.binaryTypeInfo, + gbOutputNames.get(colPos++), "", false); + rsValueCols.add(colExpr); + + // Create the reduce sink operator + ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc( + new ArrayList(), rsValueCols, gbOutputNames, false, + -1, 0, 1, Operation.NOT_ACID); + ReduceSinkOperator rsOp = (ReduceSinkOperator)OperatorFactory.getAndMakeChild( + rsDesc, new RowSchema(groupByOp.getSchema()), groupByOp); + Map columnExprMap = new HashMap(); + rsOp.setColumnExprMap(columnExprMap); + + // Create the final Group By Operator + ArrayList aggsFinal = new ArrayList(); + try { + List minFinalFnOIs = new 
ArrayList(); + List maxFinalFnOIs = new ArrayList(); + List bloomFilterFinalFnOIs = new ArrayList(); + ArrayList minFinalParams = new ArrayList(); + ArrayList maxFinalParams = new ArrayList(); + ArrayList bloomFilterFinalParams = new ArrayList(); + // Use the expressions from Reduce Sink. + minFinalFnOIs.add(rsValueCols.get(0).getWritableObjectInspector()); + maxFinalFnOIs.add(rsValueCols.get(1).getWritableObjectInspector()); + bloomFilterFinalFnOIs.add(rsValueCols.get(2).getWritableObjectInspector()); + // Coming from a ReduceSink the aggregations would be in the form VALUE._col0, VALUE._col1 + minFinalParams.add( + new ExprNodeColumnDesc( + rsValueCols.get(0).getTypeInfo(), + Utilities.ReduceField.VALUE + "." + + gbOutputNames.get(0), "", false)); + maxFinalParams.add( + new ExprNodeColumnDesc( + rsValueCols.get(1).getTypeInfo(), + Utilities.ReduceField.VALUE + "." + + gbOutputNames.get(1), "", false)); + bloomFilterFinalParams.add( + new ExprNodeColumnDesc( + rsValueCols.get(2).getTypeInfo(), + Utilities.ReduceField.VALUE + "." + + gbOutputNames.get(2), "", false)); + + AggregationDesc min = new AggregationDesc("min", + FunctionRegistry.getGenericUDAFEvaluator("min", minFinalFnOIs, + false, false), + minFinalParams, false, Mode.FINAL); + AggregationDesc max = new AggregationDesc("max", + FunctionRegistry.getGenericUDAFEvaluator("max", maxFinalFnOIs, + false, false), + maxFinalParams, false, Mode.FINAL); + AggregationDesc bloomFilter = new AggregationDesc("bloom_filter", + FunctionRegistry.getGenericUDAFEvaluator("bloom_filter", bloomFilterFinalFnOIs, + false, false), + bloomFilterFinalParams, false, Mode.FINAL); + GenericUDAFBloomFilterEvaluator bloomFilterEval = (GenericUDAFBloomFilterEvaluator) bloomFilter.getGenericUDAFEvaluator(); + bloomFilterEval.setSourceOperator(selectOp); + bloomFilter.setGenericUDAFWritableEvaluator(bloomFilterEval); + + aggsFinal.add(min); + aggsFinal.add(max); + aggsFinal.add(bloomFilter); + } catch (SemanticException e) { + LOG.error("Error creating min/max aggregations on key", e); + throw new IllegalStateException("Error creating min/max aggregations on key", e); + } + + GroupByDesc groupByDescFinal = new GroupByDesc(GroupByDesc.Mode.FINAL, + gbOutputNames, new ArrayList(), aggsFinal, false, + groupByMemoryUsage, memoryThreshold, null, false, 0, false); + GroupByOperator groupByOpFinal = (GroupByOperator)OperatorFactory.getAndMakeChild( + groupByDescFinal, new RowSchema(rsOp.getSchema()), rsOp); + groupByOpFinal.setColumnExprMap(new HashMap()); + + // Create the final Reduce Sink Operator + ReduceSinkDesc rsDescFinal = PlanUtils.getReduceSinkDesc( + new ArrayList(), rsValueCols, gbOutputNames, false, + -1, 0, 1, Operation.NOT_ACID); + ReduceSinkOperator rsOpFinal = (ReduceSinkOperator)OperatorFactory.getAndMakeChild( + rsDescFinal, new RowSchema(groupByOpFinal.getSchema()), groupByOpFinal); + rsOpFinal.setColumnExprMap(columnExprMap); + + LOG.debug("DynamicMinMaxPushdown: Saving RS to TS mapping: " + rsOpFinal + ": " + ts); + parseContext.getRsOpToTsOpMap().put(rsOpFinal, ts); + + // Save the info that is required at query time to resolve dynamic/runtime values. 
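+    // RuntimeValuesInfo ties the final ReduceSink to the value table layout, the three dynamic
+    // value ids (keyBaseAlias + _min/_max/_bloom_filter) and the column expressions that produce them.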
+ RuntimeValuesInfo runtimeValuesInfo = new RuntimeValuesInfo(); + TableDesc rsFinalTableDesc = PlanUtils.getReduceValueTableDesc( + PlanUtils.getFieldSchemasFromColumnList(rsValueCols, "_col")); + List dynamicValueIDs = new ArrayList(); + dynamicValueIDs.add(keyBaseAlias + "_min"); + dynamicValueIDs.add(keyBaseAlias + "_max"); + dynamicValueIDs.add(keyBaseAlias + "_bloom_filter"); + + runtimeValuesInfo.setTableDesc(rsFinalTableDesc); + runtimeValuesInfo.setDynamicValueIDs(dynamicValueIDs); + runtimeValuesInfo.setColExprs(rsValueCols); + parseContext.getRsToRuntimeValuesInfoMap().put(rsOpFinal, runtimeValuesInfo); + + return true; + } + private Map collectDynamicPruningConditions(ExprNodeDesc pred, NodeProcessorCtx ctx) throws SemanticException { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java index 9e9beb0..b853a06 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/FixedBucketPruningOptimizer.java @@ -135,7 +135,7 @@ protected void generatePredicate(NodeProcessorCtx procCtx, return; } // the sargs are closely tied to hive.optimize.index.filter - SearchArgument sarg = ConvertAstToSearchArg.create(filter); + SearchArgument sarg = ConvertAstToSearchArg.create(ctxt.pctx.getConf(), filter); if (sarg == null) { return; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RedundantDynamicPruningConditionsRemoval.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RedundantDynamicPruningConditionsRemoval.java index d9ce017..b8a60f9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RedundantDynamicPruningConditionsRemoval.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RedundantDynamicPruningConditionsRemoval.java @@ -24,6 +24,7 @@ import java.util.Stack; import org.apache.calcite.util.Pair; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.FilterOperator; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.TableScanOperator; @@ -75,16 +76,19 @@ */ @Override public ParseContext transform(ParseContext pctx) throws SemanticException { - Map opRules = new LinkedHashMap(); - opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%" + - FilterOperator.getOperatorName() + "%"), new FilterTransformer()); - - Dispatcher disp = new DefaultRuleDispatcher(null, opRules, null); - GraphWalker ogw = new DefaultGraphWalker(disp); - - List topNodes = new ArrayList(); - topNodes.addAll(pctx.getTopOps().values()); - ogw.startWalking(topNodes, null); + // Make sure semijoin is not enabled. If it is, then do not remove the dynamic partition pruning predicates. 
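+  // The synthetic dynamic-pruning predicates are reused to build the semijoin min/max/bloom filter
+  // branches, so they have to stay in place when semijoin reduction is enabled.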
+ if (!pctx.getConf().getBoolVar(HiveConf.ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION)) { + Map opRules = new LinkedHashMap(); + opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%" + + FilterOperator.getOperatorName() + "%"), new FilterTransformer()); + + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, null); + GraphWalker ogw = new DefaultGraphWalker(disp); + + List topNodes = new ArrayList(); + topNodes.addAll(pctx.getTopOps().values()); + ogw.startWalking(topNodes, null); + } return pctx; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index aa1e509..fae4de5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -51,23 +51,8 @@ import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; import org.apache.hadoop.hive.ql.parse.SemanticException; -import org.apache.hadoop.hive.ql.plan.AggregationDesc; -import org.apache.hadoop.hive.ql.plan.ColStatistics; -import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.*; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper; -import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; -import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicListDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; -import org.apache.hadoop.hive.ql.plan.GroupByDesc; -import org.apache.hadoop.hive.ql.plan.JoinCondDesc; -import org.apache.hadoop.hive.ql.plan.JoinDesc; -import org.apache.hadoop.hive.ql.plan.MapJoinDesc; -import org.apache.hadoop.hive.ql.plan.OperatorDesc; -import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; @@ -494,6 +479,12 @@ private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, AnnotateSt final ExprNodeDesc leftExpression = fd.getChildren().get(2); // left expression final ExprNodeDesc rightExpression = fd.getChildren().get(3); // right expression + // Short circuit and return the current number of rows if this is a + // synthetic predicate with dynamic values + if (leftExpression instanceof ExprNodeDynamicValueDesc) { + return stats.getNumRows(); + } + // We transform the BETWEEN clause to AND clause (with NOT on top in invert is true). 
// This is more straightforward, as the evaluateExpression method will deal with // generating the final row count relying on the basic comparator evaluation methods @@ -888,19 +879,25 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child, } else if (udf instanceof GenericUDFOPNotEqual) { return numRows; } else if (udf instanceof GenericUDFOPEqualOrGreaterThan - || udf instanceof GenericUDFOPEqualOrLessThan - || udf instanceof GenericUDFOPGreaterThan - || udf instanceof GenericUDFOPLessThan) { + || udf instanceof GenericUDFOPEqualOrLessThan + || udf instanceof GenericUDFOPGreaterThan + || udf instanceof GenericUDFOPLessThan) { return evaluateComparator(stats, genFunc); } else if (udf instanceof GenericUDFOPNotNull) { - return evaluateNotNullExpr(stats, genFunc); + return evaluateNotNullExpr(stats, genFunc); } else if (udf instanceof GenericUDFOPNull) { return evaluateColEqualsNullExpr(stats, genFunc); } else if (udf instanceof GenericUDFOPAnd || udf instanceof GenericUDFOPOr - || udf instanceof GenericUDFIn || udf instanceof GenericUDFBetween - || udf instanceof GenericUDFOPNot) { + || udf instanceof GenericUDFIn || udf instanceof GenericUDFBetween + || udf instanceof GenericUDFOPNot) { return evaluateExpression(stats, genFunc, aspCtx, neededCols, fop, evaluatedRowCount); } + } else if (child instanceof ExprNodeConstantDesc) { + if (Boolean.FALSE.equals(((ExprNodeConstantDesc) child).getValue())) { + return 0; + } else { + return stats.getNumRows(); + } } // worst case diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java index e2363eb..4da342b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java @@ -20,13 +20,7 @@ import static org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits.AUTOPARALLEL; -import java.util.ArrayList; -import java.util.Deque; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; +import java.util.*; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; @@ -42,18 +36,13 @@ import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.UnionOperator; import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.lib.*; import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils; -import org.apache.hadoop.hive.ql.plan.BaseWork; -import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; -import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; -import org.apache.hadoop.hive.ql.plan.FileSinkDesc; -import org.apache.hadoop.hive.ql.plan.MapWork; -import org.apache.hadoop.hive.ql.plan.ReduceWork; -import org.apache.hadoop.hive.ql.plan.TableDesc; -import org.apache.hadoop.hive.ql.plan.TezEdgeProperty; +import org.apache.hadoop.hive.ql.plan.*; import org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType; -import org.apache.hadoop.hive.ql.plan.TezWork; -import org.apache.hadoop.hive.ql.plan.UnionWork; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFInBloomFilter; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -269,6 +258,15 @@ public static void removeUnionOperators(GenTezProcContext context, BaseWork work ((DynamicPruningEventDesc) event.getConf()).setTableScan((TableScanOperator) 
newRoot); } } + // This TableScanOperator could be part of semijoin optimization. + Map rsOpToTsOpMap = + context.parseContext.getRsOpToTsOpMap(); + for (ReduceSinkOperator rs : rsOpToTsOpMap.keySet()) { + if (rsOpToTsOpMap.get(rs) == orig) { + rsOpToTsOpMap.put(rs, (TableScanOperator) newRoot); + break; + } + } } context.rootToWorkMap.remove(orig); context.rootToWorkMap.put(newRoot, work); @@ -479,7 +477,7 @@ private static void findRoots(Operator op, List> ops) { * Remove an operator branch. When we see a fork, we know it's time to do the removal. * @param event the leaf node of which branch to be removed */ - public static void removeBranch(AppMasterEventOperator event) { + public static void removeBranch(Operator event) { Operator child = event; Operator curr = event; @@ -507,4 +505,139 @@ public static EdgeType determineEdgeType(BaseWork preceedingWork, BaseWork follo } return EdgeType.SIMPLE_EDGE; } + + public static void processDynamicMinMaxPushDownOperator( + GenTezProcContext procCtx, RuntimeValuesInfo runtimeValuesInfo, + ReduceSinkOperator rs) + throws SemanticException { + TableScanOperator ts = procCtx.parseContext.getRsOpToTsOpMap().get(rs); + + List rsWorkList = procCtx.childToWorkMap.get(rs); + if (ts == null || rsWorkList == null) { + // This happens when the ReduceSink's edge has been removed by cycle + // detection logic. Nothing to do here. + return; + } + LOG.debug("ReduceSink " + rs + " to TableScan " + ts); + + if (rsWorkList.size() != 1) { + StringBuilder sb = new StringBuilder(); + for (BaseWork curWork : rsWorkList) { + if (sb.length() > 0) { + sb.append(", "); + } + sb.append(curWork.getName()); + } + throw new SemanticException(rs + " belongs to multiple BaseWorks: " + sb.toString()); + } + + BaseWork parentWork = rsWorkList.get(0); + BaseWork childWork = procCtx.rootToWorkMap.get(ts); + + // Connect parent/child work with a broadcast edge. + LOG.debug("Connecting BaseWork - " + parentWork.getName() + " to " + childWork.getName()); + TezEdgeProperty edgeProperty = new TezEdgeProperty(EdgeType.BROADCAST_EDGE); + TezWork tezWork = procCtx.currentTask.getWork(); + tezWork.connect(parentWork, childWork, edgeProperty); + + // Set output names in ReduceSink + rs.getConf().setOutputName(childWork.getName()); + + // Set up the dynamic values in the childWork.
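+    // The consuming work keeps its own copy of the RuntimeValuesInfo, keyed by the parent work's
+    // name, so that the values arriving over the broadcast edge can be registered under the
+    // expected dynamic value ids at execution time.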
+ RuntimeValuesInfo childRuntimeValuesInfo = + new RuntimeValuesInfo(); + childRuntimeValuesInfo.setTableDesc(runtimeValuesInfo.getTableDesc()); + childRuntimeValuesInfo.setDynamicValueIDs(runtimeValuesInfo.getDynamicValueIDs()); + childRuntimeValuesInfo.setColExprs(runtimeValuesInfo.getColExprs()); + childWork.setInputSourceToRuntimeValuesInfo( + parentWork.getName(), childRuntimeValuesInfo); + } + + // Functionality to remove semi-join optimization + public static void removeSemiJoinOperator(ParseContext context, + ReduceSinkOperator rs, + TableScanOperator ts) throws SemanticException{ + // Cleanup the synthetic predicate in the tablescan operator by + // replacing it with "true" + LOG.debug("Removing ReduceSink " + rs + " and TableScan " + ts); + ExprNodeDesc constNode = new ExprNodeConstantDesc( + TypeInfoFactory.booleanTypeInfo, Boolean.TRUE); + DynamicValuePredicateContext filterDynamicValuePredicatesCollection = + new DynamicValuePredicateContext(); + collectDynamicValuePredicates(ts.getConf().getFilterExpr(), + filterDynamicValuePredicatesCollection); + for (ExprNodeDesc nodeToRemove : filterDynamicValuePredicatesCollection + .childParentMapping.keySet()) { + // Find out if this synthetic predicate belongs to the current cycle + boolean skip = true; + for (ExprNodeDesc expr : nodeToRemove.getChildren()) { + if (expr instanceof ExprNodeDynamicValueDesc ) { + String dynamicValueIdFromExpr = ((ExprNodeDynamicValueDesc) expr) + .getDynamicValue().getId(); + List dynamicValueIdsFromMap = context. + getRsToRuntimeValuesInfoMap().get(rs).getDynamicValueIDs(); + for (String dynamicValueIdFromMap : dynamicValueIdsFromMap) { + if (dynamicValueIdFromExpr.equals(dynamicValueIdFromMap)) { + // Intended predicate to be removed + skip = false; + break; + } + } + } + } + if (!skip) { + ExprNodeDesc nodeParent = filterDynamicValuePredicatesCollection + .childParentMapping.get(nodeToRemove); + if (nodeParent == null) { + // This was the only predicate, set filter expression to null + ts.getConf().setFilterExpr(null); + } else { + int i = nodeParent.getChildren().indexOf(nodeToRemove); + nodeParent.getChildren().remove(i); + nodeParent.getChildren().add(i, constNode); + } + // skip the rest of the predicates + skip = true; + } + } + } + + private static class DynamicValuePredicateContext implements NodeProcessorCtx { + HashMap childParentMapping = new HashMap(); + } + + private static class DynamicValuePredicateProc implements NodeProcessor { + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + DynamicValuePredicateContext ctx = (DynamicValuePredicateContext) procCtx; + ExprNodeDesc parent = (ExprNodeDesc) stack.get(stack.size() - 2); + if (parent instanceof ExprNodeGenericFuncDesc) { + ExprNodeGenericFuncDesc parentFunc = (ExprNodeGenericFuncDesc) parent; + if (parentFunc.getGenericUDF() instanceof GenericUDFBetween || + parentFunc.getGenericUDF() instanceof GenericUDFInBloomFilter) { + ExprNodeDesc grandParent = stack.size() >= 3 ? + (ExprNodeDesc) stack.get(stack.size() - 3) : null; + ctx.childParentMapping.put(parentFunc, grandParent); + } + } + + return null; + } + } + + private static void collectDynamicValuePredicates(ExprNodeDesc pred, NodeProcessorCtx ctx) throws SemanticException { + // create a walker which walks the tree in a DFS manner while maintaining + // the operator stack. 
The dispatcher + // generates the plan from the operator tree + Map exprRules = new LinkedHashMap(); + exprRules.put(new RuleRegExp("R1", ExprNodeDynamicValueDesc.class.getName() + "%"), new DynamicValuePredicateProc()); + Dispatcher disp = new DefaultRuleDispatcher(null, exprRules, ctx); + GraphWalker egw = new DefaultGraphWalker(disp); + List startNodes = new ArrayList(); + startNodes.add(pred); + + egw.startWalking(startNodes, null); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java index 35f34da..3f9f76c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseContext.java @@ -50,6 +50,7 @@ import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext; import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.AnalyzeRewriteContext; +import org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo; import org.apache.hadoop.hive.ql.plan.CreateTableDesc; import org.apache.hadoop.hive.ql.plan.CreateViewDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; @@ -125,6 +126,11 @@ private boolean needViewColumnAuthorization; private Set acidFileSinks = Collections.emptySet(); + // Map to store mapping between reduce sink Operator and TS Operator for semijoin + private Map rsOpToTsOpMap = + new HashMap(); + private Map rsToRuntimeValuesInfo = + new HashMap(); public ParseContext() { } @@ -652,4 +658,19 @@ private static void getAllOps(List builder, Set visited, Ope } } + public void setRsToRuntimeValuesInfoMap(Map rsToRuntimeValuesInfo) { + this.rsToRuntimeValuesInfo = rsToRuntimeValuesInfo; + } + + public Map getRsToRuntimeValuesInfoMap() { + return rsToRuntimeValuesInfo; + } + + public void setRsOpToTsOpMap(Map rsOpToTsOpMap) { + this.rsOpToTsOpMap = rsOpToTsOpMap; + } + + public Map getRsOpToTsOpMap() { + return rsOpToTsOpMap; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/RuntimeValuesInfo.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/RuntimeValuesInfo.java new file mode 100644 index 0000000..e1f78f7 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/RuntimeValuesInfo.java @@ -0,0 +1,44 @@ +package org.apache.hadoop.hive.ql.parse; + +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; + +import java.io.Serializable; +import java.util.List; + +/** + * Holds structures required for runtime values and mappings. 
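+ * Ties a semijoin ReduceSink to the value table layout, the dynamic value ids and the column
+ * expressions used to populate them.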
+ */ +public class RuntimeValuesInfo implements Serializable { + private static final long serialVersionUID = 1L; + + private TableDesc tableDesc; + private List dynamicValueIDs; + private List colExprs; + + // get-set methods + public TableDesc getTableDesc() { + return tableDesc; + } + + public void setTableDesc(TableDesc tableDesc) { + this.tableDesc = tableDesc; + } + + public List getDynamicValueIDs() { + return dynamicValueIDs; + } + + public void setDynamicValueIDs(List dynamicValueIDs) { + this.dynamicValueIDs = dynamicValueIDs; + } + + public List getColExprs() { + return colExprs; + } + + public void setColExprs(List colExprs) { + this.colExprs = colExprs; + } +} + diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java index e8b003e..5f9ccc8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java @@ -530,6 +530,9 @@ public ParseContext getParseContext(ParseContext pCtx, List inputs, runCycleAnalysisForPartitionPruning(procCtx, inputs, outputs); perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Run cycle analysis for partition pruning"); + perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER); + // Remove semijoin optimization if it creates a cycle with mapside joins + removeSemiJoinCyclesDueToMapsideJoins(procCtx); + perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Remove semijoin optimizations if it creates a cycle with mapside join"); + + perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER); + // Remove semijoin optimization if SMB join is created. + removeSemijoinOptimizationFromSMBJoins(procCtx); + perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Remove semijoin optimizations if needed"); + + perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER); + // Remove bloomfilter if no stats generated + removeSemiJoinIfNoStats(procCtx); + perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Remove bloom filter optimizations if needed"); + + // need a new run of the constant folding because we might have created lots + // of "and true and true" conditions. + // Rather than run the full constant folding just need to shortcut AND/OR expressions + // involving constant true/false values. 
+ if(procCtx.conf.getBoolVar(ConfVars.HIVEOPTCONSTANTPROPAGATION)) { + new ConstantPropagate(ConstantPropagateOption.SHORTCUT).transform(procCtx.parseContext); + } + } private void runCycleAnalysisForPartitionPruning(OptimizeTezProcContext procCtx, @@ -163,7 +153,7 @@ private void runCycleAnalysisForPartitionPruning(OptimizeTezProcContext procCtx, if (component.size() != 1) { LOG.info("Found cycle in operator plan..."); cycleFree = false; - removeEventOperator(component, procCtx); + removeCycleOperator(component, procCtx); break; } } @@ -171,29 +161,72 @@ private void runCycleAnalysisForPartitionPruning(OptimizeTezProcContext procCtx, } } - private void removeEventOperator(Set> component, OptimizeTezProcContext context) { - AppMasterEventOperator victim = null; + private void removeCycleOperator(Set> component, OptimizeTezProcContext context) throws SemanticException { + AppMasterEventOperator victimAM = null; + TableScanOperator victimTS = null; + ReduceSinkOperator victimRS = null; + for (Operator o : component) { + // Look for AppMasterEventOperator or ReduceSinkOperator if (o instanceof AppMasterEventOperator) { - if (victim == null - || o.getConf().getStatistics().getDataSize() < victim.getConf().getStatistics() + if (victimAM == null + || o.getStatistics().getDataSize() < victimAM.getStatistics() .getDataSize()) { - victim = (AppMasterEventOperator) o; + victimAM = (AppMasterEventOperator) o; + } + } else if (o instanceof ReduceSinkOperator) { + TableScanOperator ts = context.parseContext.getRsOpToTsOpMap().get(o); + if (ts == null) { + continue; + } + // Sanity check + assert component.contains(ts); + + if (victimRS == null || + o.getStatistics().getDataSize() > + victimRS.getStatistics().getDataSize()) { + victimRS = (ReduceSinkOperator) o; + victimTS = ts; } } } + // Always set the min/max optimization as victim. + Operator victim = victimRS; + + if (victimRS == null && victimAM != null ) { + victim = victimAM; + } else if (victimAM == null) { + // do nothing + } else { + // Cycle consists of atleast one dynamic partition pruning(DPP) + // optimization and atleast one min/max optimization. + // DPP is a better optimization unless it ends up scanning the + // bigger table for keys instead of the smaller table. + + // Get the parent TS of victimRS. + Operator op = victimRS; + while(!(op instanceof TableScanOperator)) { + op = op.getParentOperators().get(0); + } + if ((2 * op.getStatistics().getDataSize()) < + victimAM.getStatistics().getDataSize()) { + victim = victimAM; + } + } + if (victim == null || - (!context.pruningOpsRemovedByPriorOpt.isEmpty() && - context.pruningOpsRemovedByPriorOpt.contains(victim))) { + (!context.pruningOpsRemovedByPriorOpt.isEmpty() && + context.pruningOpsRemovedByPriorOpt.contains(victim))) { return; } GenTezUtils.removeBranch(victim); - // at this point we've found the fork in the op pipeline that has the pruning as a child plan. - LOG.info("Disabling dynamic pruning for: " - + ((DynamicPruningEventDesc) victim.getConf()).getTableScan().toString() - + ". 
Needed to break cyclic dependency"); + + if (victim == victimRS) { + GenTezUtils.removeSemiJoinOperator(context.parseContext, victimRS, victimTS); + } + return; } // Tarjan's algo @@ -205,11 +238,11 @@ private void removeEventOperator(Set> component, OptimizeTezProcCont Map, Integer> indexes = new HashMap, Integer>(); Map, Integer> lowLinks = new HashMap, Integer>(); Stack> nodes = new Stack>(); - Set>> components = new HashSet>>(); + Set>> components = new LinkedHashSet>>(); for (Operator o : deque) { if (!indexes.containsKey(o)) { - connect(o, index, nodes, indexes, lowLinks, components); + connect(o, index, nodes, indexes, lowLinks, components, procCtx.parseContext); } } @@ -218,7 +251,7 @@ private void removeEventOperator(Set> component, OptimizeTezProcCont private void connect(Operator o, AtomicInteger index, Stack> nodes, Map, Integer> indexes, Map, Integer> lowLinks, - Set>> components) { + Set>> components, ParseContext parseContext) { indexes.put(o, index.get()); lowLinks.put(o, index.get()); @@ -232,13 +265,22 @@ private void connect(Operator o, AtomicInteger index, Stack> node TableScanOperator ts = ((DynamicPruningEventDesc) o.getConf()).getTableScan(); LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString()); children.add(ts); + } else if (o instanceof ReduceSinkOperator){ + // min/max case + children = new ArrayList>(); + children.addAll(o.getChildOperators()); + TableScanOperator ts = parseContext.getRsOpToTsOpMap().get(o); + if (ts != null) { + LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString()); + children.add(ts); + } } else { children = o.getChildOperators(); } for (Operator child : children) { if (!indexes.containsKey(child)) { - connect(child, index, nodes, indexes, lowLinks, components); + connect(child, index, nodes, indexes, lowLinks, components, parseContext); lowLinks.put(o, Math.min(lowLinks.get(o), lowLinks.get(child))); } else if (nodes.contains(child)) { lowLinks.put(o, Math.min(lowLinks.get(o), indexes.get(child))); @@ -246,7 +288,7 @@ private void connect(Operator o, AtomicInteger index, Stack> node } if (lowLinks.get(o).equals(indexes.get(o))) { - Set> component = new HashSet>(); + Set> component = new LinkedHashSet>(); components.add(component); Operator current; do { @@ -315,14 +357,6 @@ private void runDynamicPartitionPruning(OptimizeTezProcContext procCtx, Set> rootTasks, Pa GenTezUtils.processFileSink(procCtx, fileSink); } + // Connect any edges required for min/max pushdown + if (pCtx.getRsToRuntimeValuesInfoMap().size() > 0) { + for (ReduceSinkOperator rs : pCtx.getRsToRuntimeValuesInfoMap().keySet()) { + // Process min/max + GenTezUtils.processDynamicMinMaxPushDownOperator( + procCtx, pCtx.getRsToRuntimeValuesInfoMap().get(rs), rs); + } + } // and finally we hook up any events that need to be sent to the tez AM LOG.debug("There are " + procCtx.eventOperatorSet.size() + " app master events."); for (AppMasterEventOperator event : procCtx.eventOperatorSet) { @@ -528,4 +570,255 @@ protected void optimizeTaskPlan(List> rootTasks, Pa perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "optimizeTaskPlan"); return; } + + private static class SemijoinRemovalContext implements NodeProcessorCtx { + List> parents = new ArrayList>(); + } + + private static class SemijoinRemovalProc implements NodeProcessor { + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... 
nodeOutputs) throws SemanticException { + SemijoinRemovalContext ctx = (SemijoinRemovalContext) procCtx; + Operator parent = (Operator) stack.get(stack.size() - 2); + ctx.parents.add(parent); + return null; + } + } + + private static void collectSemijoinOps(Operator ts, NodeProcessorCtx ctx) throws SemanticException { + // create a walker which walks the tree in a DFS manner while maintaining + // the operator stack. The dispatcher + // generates the plan from the operator tree + Map opRules = new LinkedHashMap(); + opRules.put(new RuleRegExp("R1", SelectOperator.getOperatorName() + "%" + + TezDummyStoreOperator.getOperatorName() + "%"), + new SemijoinRemovalProc()); + opRules.put(new RuleRegExp("R2", SelectOperator.getOperatorName() + "%" + + CommonMergeJoinOperator.getOperatorName() + "%"), + new SemijoinRemovalProc()); + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, ctx); + GraphWalker ogw = new PreOrderOnceWalker(disp); + List startNodes = new ArrayList(); + startNodes.add(ts); + + HashMap outputMap = new HashMap(); + ogw.startWalking(startNodes, null); + } + + private static class SMBJoinOpProc implements NodeProcessor { + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + List tsOps = new ArrayList(); + // Get one top level TS Op directly from the stack + tsOps.add((TableScanOperator)stack.get(0)); + + // Get the other one by examining Join Op + List> parents = ((CommonMergeJoinOperator) nd).getParentOperators(); + for (Operator parent : parents) { + if (parent instanceof TezDummyStoreOperator) { + // already accounted for + continue; + } + + assert parent instanceof SelectOperator; + while(parent != null) { + if (parent instanceof TableScanOperator) { + tsOps.add((TableScanOperator) parent); + break; + } + parent = parent.getParentOperators().get(0); + } + } + + // Now the relevant TableScanOperators are known, find if there exists + // a semijoin filter on any of them, if so, remove it. + ParseContext pctx = ((OptimizeTezProcContext) procCtx).parseContext; + for (TableScanOperator ts : tsOps) { + for (ReduceSinkOperator rs : pctx.getRsOpToTsOpMap().keySet()) { + if (ts == pctx.getRsOpToTsOpMap().get(rs)) { + // match! + GenTezUtils.removeBranch(rs); + GenTezUtils.removeSemiJoinOperator(pctx, rs, ts); + } + } + } + return null; + } + } + + private static void removeSemijoinOptimizationFromSMBJoins( + OptimizeTezProcContext procCtx) throws SemanticException { + if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) || + procCtx.parseContext.getRsOpToTsOpMap().size() == 0) { + return; + } + + Map opRules = new LinkedHashMap(); + opRules.put( + new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%" + + ".*" + TezDummyStoreOperator.getOperatorName() + "%" + + CommonMergeJoinOperator.getOperatorName() + "%"), + new SMBJoinOpProc()); + + // The dispatcher finds SMB and if there is semijoin optimization before it, removes it. + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx); + List topNodes = new ArrayList(); + topNodes.addAll(procCtx.parseContext.getTopOps().values()); + GraphWalker ogw = new PreOrderOnceWalker(disp); + ogw.startWalking(topNodes, null); + } + + private static class SemiJoinCycleRemovalDueToMapsideJoins implements NodeProcessor { + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... 
nodeOutputs) throws SemanticException { + ParseContext pCtx = ((OptimizeTezProcContext) procCtx).parseContext; + Operator childJoin = ((Operator) nd); + Operator parentJoin = ((Operator) stack.get(stack.size() - 2)); + + if (parentJoin.getChildOperators().size() == 1) { + // Nothing to do here + return null; + } + + for (Operator child : parentJoin.getChildOperators()) { + if (!(child instanceof SelectOperator)) { + continue; + } + + while(child.getChildOperators().size() > 0) { + child = child.getChildOperators().get(0); + } + + if (!(child instanceof ReduceSinkOperator)) { + continue; + } + + ReduceSinkOperator rs = ((ReduceSinkOperator) child); + TableScanOperator ts = pCtx.getRsOpToTsOpMap().get(rs); + if (ts == null) { + continue; + } + // This is a semijoin branch. Find if this is creating a potential + // cycle with childJoin. + for (Operator parent : childJoin.getParentOperators()) { + if (parent == parentJoin) { + continue; + } + + assert parent instanceof ReduceSinkOperator; + while (parent.getParentOperators().size() > 0) { + parent = parent.getParentOperators().get(0); + } + + if (parent == ts) { + // We have a cycle! + GenTezUtils.removeBranch(rs); + GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts); + } + } + } + return null; + } + } + + private static void removeSemiJoinCyclesDueToMapsideJoins( + OptimizeTezProcContext procCtx) throws SemanticException { + if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) || + procCtx.parseContext.getRsOpToTsOpMap().size() == 0) { + return; + } + + Map opRules = new LinkedHashMap(); + opRules.put( + new RuleRegExp("R1", MapJoinOperator.getOperatorName() + "%" + + MapJoinOperator.getOperatorName() + "%"), + new SemiJoinCycleRemovalDueToMapsideJoins()); + opRules.put( + new RuleRegExp("R2", MapJoinOperator.getOperatorName() + "%" + + CommonMergeJoinOperator.getOperatorName() + "%"), + new SemiJoinCycleRemovalDueToMapsideJoins()); + opRules.put( + new RuleRegExp("R3", CommonMergeJoinOperator.getOperatorName() + "%" + + MapJoinOperator.getOperatorName() + "%"), + new SemiJoinCycleRemovalDueToMapsideJoins()); + opRules.put( + new RuleRegExp("R4", CommonMergeJoinOperator.getOperatorName() + "%" + + CommonMergeJoinOperator.getOperatorName() + "%"), + new SemiJoinCycleRemovalDueToMapsideJoins()); + + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx); + List topNodes = new ArrayList(); + topNodes.addAll(procCtx.parseContext.getTopOps().values()); + GraphWalker ogw = new PreOrderOnceWalker(disp); + ogw.startWalking(topNodes, null); + } + + private static class SemiJoinRemovalIfNoStatsProc implements NodeProcessor { + + @Override + public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + assert nd instanceof ReduceSinkOperator; + ReduceSinkOperator rs = (ReduceSinkOperator) nd; + ParseContext pCtx = ((OptimizeTezProcContext) procCtx).parseContext; + TableScanOperator ts = pCtx.getRsOpToTsOpMap().get(rs); + if (ts == null) { + // nothing to do here. + return null; + } + + // This is a semijoin branch. 
The stack should look like, + // -SEL-GB1-RS1-GB2-RS2 + GroupByOperator gbOp = (GroupByOperator) (stack.get(stack.size() - 2)); + GroupByDesc gbDesc = gbOp.getConf(); + ArrayList aggregationDescs = gbDesc.getAggregators(); + boolean removeSemiJoin = false; + for (AggregationDesc agg : aggregationDescs) { + if (agg.getGenericUDAFName() != "bloom_filter") { + continue; + } + + GenericUDAFBloomFilterEvaluator udafBloomFilterEvaluator = + (GenericUDAFBloomFilterEvaluator) agg.getGenericUDAFEvaluator(); + if (udafBloomFilterEvaluator.getSourceOperator().getStatistics(). + getNumRows() == -1) { + removeSemiJoin = true; + break; + } + } + if (removeSemiJoin) { + // The stats are not annotated, remove the semijoin operator + GenTezUtils.removeBranch(rs); + GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts); + } + return null; + } + } + + private void removeSemiJoinIfNoStats(OptimizeTezProcContext procCtx) + throws SemanticException { + if(!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION)) { + // Not needed without semi-join reduction + return; + } + + Map opRules = new LinkedHashMap(); + opRules.put( + new RuleRegExp("R1", GroupByOperator.getOperatorName() + "%" + + ReduceSinkOperator.getOperatorName() + "%" + + GroupByOperator.getOperatorName() + "%" + + ReduceSinkOperator.getOperatorName() + "%"), + new SemiJoinRemovalIfNoStatsProc()); + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx); + List topNodes = new ArrayList(); + topNodes.addAll(procCtx.parseContext.getTopOps().values()); + GraphWalker ogw = new PreOrderOnceWalker(disp); + ogw.startWalking(topNodes, null); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/AggregationDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/AggregationDesc.java index 1ecbaad..f0b062e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/AggregationDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/AggregationDesc.java @@ -152,6 +152,13 @@ public String getExprString() { } sb.append(exp.getExprString()); } + + String evaluatorExpr = getGenericUDAFEvaluator().getExprString(); + if (evaluatorExpr != null && !evaluatorExpr.isEmpty()) { + sb.append(", "); + sb.append(evaluatorExpr); + } + sb.append(")"); return sb.toString(); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java index 13a0811..8c341fc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/BaseWork.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.plan; import java.util.ArrayList; +import java.util.HashMap; import java.util.LinkedList; import java.util.LinkedHashSet; import java.util.List; @@ -32,6 +33,7 @@ import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; +import org.apache.hadoop.hive.ql.parse.RuntimeValuesInfo; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.hive.ql.plan.Explain.Level; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; @@ -76,6 +78,10 @@ public BaseWork(String name) { private int reservedMemoryMB = -1; // default to -1 means we leave it up to Tez to decide + // Used for value registry + private Map inputSourceToRuntimeValuesInfo = + new HashMap(); + public void setGatheringStats(boolean gatherStats) { this.gatheringStats = gatherStats; } @@ -251,4 +257,13 @@ public void addSortCols(List sortCols) { public List getSortCols() { 
return sortColNames; } + + public Map getInputSourceToRuntimeValuesInfo() { + return inputSourceToRuntimeValuesInfo; + } + + public void setInputSourceToRuntimeValuesInfo( + String workName, RuntimeValuesInfo runtimeValuesInfo) { + inputSourceToRuntimeValuesInfo.put(workName, runtimeValuesInfo); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/DynamicValue.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/DynamicValue.java new file mode 100644 index 0000000..874c62b --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/DynamicValue.java @@ -0,0 +1,137 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.DynamicValueRegistry; +import org.apache.hadoop.hive.ql.exec.ObjectCache; +import org.apache.hadoop.hive.ql.exec.ObjectCacheFactory; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.io.sarg.LiteralDelegate; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; + +import java.io.Serializable; + + +public class DynamicValue implements LiteralDelegate, Serializable { + + private static final long serialVersionUID = 1L; + + public static final String DYNAMIC_VALUE_REGISTRY_CACHE_KEY = "DynamicValueRegistry"; + + protected transient Configuration conf; + + protected String id; + TypeInfo typeInfo; + PrimitiveObjectInspector objectInspector; + + transient protected Object val; + transient boolean initialized = false; + + public DynamicValue(String id, TypeInfo typeInfo) { + this.id = id; + this.typeInfo = typeInfo; + this.objectInspector = (PrimitiveObjectInspector) TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(typeInfo); + } + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + } + + @Override + public Configuration getConf() { + return conf; + } + + public TypeInfo getTypeInfo() { + return typeInfo; + } + + public void setTypeInfo(TypeInfo typeInfo) { + this.typeInfo = typeInfo; + } + + public PrimitiveObjectInspector getObjectInspector() { + return objectInspector; + } + + public void setObjectInspector(PrimitiveObjectInspector objectInspector) { + this.objectInspector = objectInspector; + } + + @Override + public String getId() { return id;} + + public void setId(String id) { + this.id = id; + } + + @Override + public Object getLiteral() { + return getJavaValue(); + } + + public Object getJavaValue() { 
+ return objectInspector.getPrimitiveJavaObject(getValue()); + } + + public Object getWritableValue() { + return objectInspector.getPrimitiveWritableObject(getValue()); + } + + public Object getValue() { + if (initialized) { + return val; + } + + if (conf == null) { + throw new IllegalStateException("Cannot retrieve dynamic value " + id + " - no conf set"); + } + + try { + // Get object cache + String queryId = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEQUERYID); + ObjectCache cache = ObjectCacheFactory.getCache(conf, queryId, false); + + // Get the registry + DynamicValueRegistry valueRegistry = cache.retrieve(DYNAMIC_VALUE_REGISTRY_CACHE_KEY); + if (valueRegistry == null) { + throw new IllegalStateException("DynamicValueRegistry not available"); + } + val = valueRegistry.getValue(id); + initialized = true; + } catch (Exception err) { + throw new IllegalStateException("Failed to retrieve dynamic value for " + id, err); + } + + return val; + } + + @Override + public String toString() { + // If the id is a generated unique ID then this could affect .q file golden files for tests that run EXPLAIN queries. + return "DynamicValue(" + id + ")"; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicValueDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicValueDesc.java new file mode 100644 index 0000000..c9e7b67 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicValueDesc.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import java.io.Serializable; + + +/** + * This expression represents a value that will be available at runtime. + * + */ +public class ExprNodeDynamicValueDesc extends ExprNodeDesc implements Serializable { + + private static final long serialVersionUID = 1L; + + protected DynamicValue dynamicValue; + + public ExprNodeDynamicValueDesc() { + } + + public ExprNodeDynamicValueDesc(DynamicValue value) { + super(value.getTypeInfo()); + this.dynamicValue = value; + } + + @Override + public ExprNodeDesc clone() { + return new ExprNodeDynamicValueDesc(dynamicValue); + } + + @Override + public boolean isSame(Object o) { + if (o instanceof ExprNodeDynamicValueDesc) { + Object otherValue = ((ExprNodeDynamicValueDesc) o).getDynamicValue(); + if (dynamicValue == null) { + return otherValue == null; + } + return dynamicValue.equals(otherValue); + } + return false; + } + + public DynamicValue getDynamicValue() { + return dynamicValue; + } + + public void setValue(DynamicValue value) { + this.dynamicValue = value; + } + + @Override + public String getExprString() { + return dynamicValue != null ? 
dynamicValue.toString() : "null dynamic literal"; + } + + @Override + public String toString() { + return dynamicValue != null ? dynamicValue.toString() : "null dynamic literal"; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java new file mode 100644 index 0000000..fd95cc0 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java @@ -0,0 +1,260 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.*; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; +import org.apache.hive.common.util.BloomFilter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.sql.Timestamp; + +/** + * Generic UDF to generate Bloom Filter + */ +public class GenericUDAFBloomFilter implements GenericUDAFResolver2 { + + private static final Logger LOG = LoggerFactory.getLogger(GenericUDAFBloomFilter.class); + + @Override + public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException { + return new GenericUDAFBloomFilterEvaluator(); + } + + @Override + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException { + return new GenericUDAFBloomFilterEvaluator(); + } + + /** + * GenericUDAFBloomFilterEvaluator - Evaluator class for BloomFilter + */ + public static class GenericUDAFBloomFilterEvaluator extends GenericUDAFEvaluator { + // Source operator to get the number of entries + private Operator sourceOperator; + + // ObjectInspector for input data. 
+ private PrimitiveObjectInspector inputOI; + + // Bloom filter rest + private ByteArrayOutputStream result = new ByteArrayOutputStream(); + + @Override + public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException { + super.init(m, parameters); + + // Initialize input + if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) { + inputOI = (PrimitiveObjectInspector) parameters[0]; + } else { + // Do nothing for other modes + } + + // Output will be same in both partial or full aggregation modes. + // It will be a BloomFilter in ByteWritable + return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; + } + + /** + * Class for storing the BloomFilter + */ + @AggregationType(estimable = true) + static class BloomFilterBuf extends AbstractAggregationBuffer { + BloomFilter bloomFilter; + + public BloomFilterBuf(long expectedEntries) { + bloomFilter = new BloomFilter(expectedEntries); + } + + @Override + public int estimate() { + return (int) bloomFilter.sizeInBytes(); + } + } + + @Override + public void reset(AggregationBuffer agg) throws HiveException { + ((BloomFilterBuf)agg).bloomFilter.reset(); + } + + @Override + public AggregationBuffer getNewAggregationBuffer() throws HiveException { + long expectedEntries = getExpectedEntries(); + if (expectedEntries < 0) { + throw new IllegalStateException("BloomFilter expectedEntries not initialized"); + } + + BloomFilterBuf buf = new BloomFilterBuf(expectedEntries); + reset(buf); + return buf; + } + + @Override + public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException { + if (parameters == null || parameters[0] == null) { + // 2nd condition occurs when the input has 0 rows (possible due to + // filtering, joins etc). + return; + } + + BloomFilter bf = ((BloomFilterBuf)agg).bloomFilter; + + // Add the expression into the BloomFilter + switch (((PrimitiveTypeInfo) TypeInfoFactory. + getPrimitiveTypeInfoFromPrimitiveWritable( + parameters[0].getClass())).getPrimitiveCategory()) { + case BOOLEAN: + boolean vBoolean = ((BooleanObjectInspector)inputOI).get(parameters[0]); + bf.addLong(vBoolean ? 1 : 0); + break; + case BYTE: + byte vByte = ((ByteObjectInspector)inputOI).get(parameters[0]); + bf.addLong(vByte); + break; + case SHORT: + short vShort = ((ShortObjectInspector)inputOI).get(parameters[0]); + bf.addLong(vShort); + break; + case INT: + int vInt = ((IntObjectInspector)inputOI).get(parameters[0]); + bf.addLong(vInt); + break; + case LONG: + long vLong = ((LongObjectInspector)inputOI).get(parameters[0]); + bf.addLong(vLong); + break; + case FLOAT: + float vFloat = ((FloatObjectInspector)inputOI).get(parameters[0]); + bf.addDouble(vFloat); + break; + case DOUBLE: + double vDouble = ((DoubleObjectInspector)inputOI).get(parameters[0]); + bf.addDouble(vDouble); + break; + case DECIMAL: + HiveDecimal vDecimal = ((HiveDecimalObjectInspector)inputOI). + getPrimitiveJavaObject(parameters[0]); + bf.addString(vDecimal.toString()); + break; + case DATE: + DateWritable vDate = ((DateObjectInspector)inputOI). + getPrimitiveWritableObject(parameters[0]); + bf.addLong(vDate.getDays()); + break; + case TIMESTAMP: + Timestamp vTimeStamp = ((TimestampObjectInspector)inputOI). + getPrimitiveJavaObject(parameters[0]); + bf.addLong(vTimeStamp.getTime()); + break; + case CHAR: + Text vChar = ((HiveCharObjectInspector)inputOI). 
+ getPrimitiveWritableObject(parameters[0]).getStrippedValue(); + bf.addBytes(vChar.getBytes(), 0, vChar.getLength()); + break; + case VARCHAR: + Text vVarChar = ((HiveVarcharObjectInspector)inputOI). + getPrimitiveWritableObject(parameters[0]).getTextValue(); + bf.addBytes(vVarChar.getBytes(), 0, vVarChar.getLength()); + break; + case STRING: + Text vString = ((StringObjectInspector)inputOI). + getPrimitiveWritableObject(parameters[0]); + bf.addBytes(vString.getBytes(), 0, vString.getLength()); + break; + case BINARY: + BytesWritable vBytes = ((BinaryObjectInspector)inputOI). + getPrimitiveWritableObject(parameters[0]); + bf.addBytes(vBytes.getBytes(), 0, vBytes.getLength()); + break; + default: + throw new UDFArgumentTypeException(0, + "Bad primitive category " + inputOI.getPrimitiveCategory()); + } + } + + @Override + public void merge(AggregationBuffer agg, Object partial) throws HiveException { + if (partial == null) { + return; + } + + BytesWritable bytes = (BytesWritable) partial; + ByteArrayInputStream in = new ByteArrayInputStream(bytes.getBytes()); + // Deserialze the bloomfilter + try { + BloomFilter bf = BloomFilter.deserialize(in); + ((BloomFilterBuf)agg).bloomFilter.merge(bf); + } catch (IOException e) { + throw new HiveException(e); + } + } + + @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + result.reset(); + try { + BloomFilter.serialize(result, ((BloomFilterBuf)agg).bloomFilter); + } catch (IOException e) { + throw new HiveException(e); + } + return new BytesWritable(result.toByteArray()); + } + + @Override + public Object terminatePartial(AggregationBuffer agg) throws HiveException { + return terminate(agg); + } + + public long getExpectedEntries() { + if (sourceOperator != null && sourceOperator.getStatistics() != null) { + return sourceOperator.getStatistics().getNumRows(); + } + return -1; + } + + public Operator getSourceOperator() { + return sourceOperator; + } + + public void setSourceOperator(Operator sourceOperator) { + this.sourceOperator = sourceOperator; + } + + @Override + public String getExprString() { + return "expectedEntires=" + getExpectedEntries(); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java index 18d5285..3a98276 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java @@ -262,6 +262,13 @@ public GenericUDAFEvaluator getWindowingEvaluator(WindowFrameDef wFrmDef) { return null; } + /** + * Optional information to add to expression string. Subclasses can override. + */ + public String getExprString() { + return ""; + } + protected BasePartitionEvaluator partitionEvaluator; /** diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFInBloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFInBloomFilter.java new file mode 100644 index 0000000..e91088e --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFInBloomFilter.java @@ -0,0 +1,167 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
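The bloom_filter aggregate above follows the usual partial/merge/terminate lifecycle: each buffer holds one BloomFilter sized from the source operator's estimated row count, terminate()/terminatePartial() serialize it to bytes, and merge() deserializes a partial result and folds it into the running filter. The following is only an illustrative sketch of that round trip, restricted to the org.apache.hive.common.util.BloomFilter calls the evaluator itself uses; the class name BloomFilterRoundTrip, the expected-entry count, and the key values are invented for the example.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.hive.common.util.BloomFilter;

public class BloomFilterRoundTrip {
  public static void main(String[] args) throws IOException {
    // Two "partial" filters, e.g. one per map task. Both are sized with the
    // same expectedEntries so that they remain mergeable.
    BloomFilter partial1 = new BloomFilter(1000);
    BloomFilter partial2 = new BloomFilter(1000);
    partial1.addLong(42L);
    byte[] key = "key1".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    partial2.addBytes(key, 0, key.length);

    // terminatePartial(): serialize one partial filter to a byte stream.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    BloomFilter.serialize(out, partial1);

    // merge(): deserialize the partial result and fold the other filter in.
    BloomFilter merged =
        BloomFilter.deserialize(new ByteArrayInputStream(out.toByteArray()));
    merged.merge(partial2);

    // Membership tests never miss inserted values (false positives are possible).
    System.out.println(merged.testLong(42L));              // true
    System.out.println(merged.testBytes(key, 0, key.length)); // true
  }
}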
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.*; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; +import org.apache.hive.common.util.BloomFilter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.sql.Timestamp; + +/** + * GenericUDF to lookup a value in BloomFilter + */ +public class GenericUDFInBloomFilter extends GenericUDF { + private static final Logger LOG = LoggerFactory.getLogger(GenericUDFInBloomFilter.class); + + private transient ObjectInspector valObjectInspector; + private transient ObjectInspector bloomFilterObjectInspector; + private transient BloomFilter bloomFilter; + private transient boolean initializedBloomFilter; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 2) { + throw new UDFArgumentLengthException( + "InBloomFilter requires exactly 2 arguments but got " + arguments.length); + } + + // Verify individual arguments + if (arguments[0].getCategory() != ObjectInspector.Category.PRIMITIVE) { + throw new UDFArgumentTypeException(0, "The 1st argument must be a primitive type but " + + arguments[0].getTypeName() + " was passed"); + } + + if (((PrimitiveObjectInspector) arguments[1]).getPrimitiveCategory() != + PrimitiveObjectInspector.PrimitiveCategory.BINARY) { + throw new UDFArgumentTypeException(1, "The 2nd argument must be a binary type but " + + arguments[1].getTypeName() + " was passed"); + } + + valObjectInspector = arguments[0]; + bloomFilterObjectInspector = arguments[1]; + + initializedBloomFilter = false; + return PrimitiveObjectInspectorFactory.javaBooleanObjectInspector; + } + + @Override + public String getDisplayString(String[] children) { + return getStandardDisplayString("in_bloom_filter", children); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + // Return if either of the arguments is null + if (arguments[0].get() == null || arguments[1].get() == null) { + return null; + } + + if (!initializedBloomFilter) { + // Setup the bloom filter once + try { + BytesWritable bw = (BytesWritable) arguments[1].get(); + byte[] bytes = new byte[bw.getLength()]; + System.arraycopy(bw.getBytes(), 0, 
bytes, 0, bw.getLength()); + bloomFilter = BloomFilter.deserialize(new ByteArrayInputStream(bytes)); + } catch ( IOException e) { + throw new HiveException(e); + } + initializedBloomFilter = true; + } + + // Check if the value is in bloom filter + switch (((PrimitiveObjectInspector)valObjectInspector). + getTypeInfo().getPrimitiveCategory()) { + case BOOLEAN: + boolean vBoolean = ((BooleanObjectInspector)valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testLong(vBoolean ? 1 : 0); + case BYTE: + byte vByte = ((ByteObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testLong(vByte); + case SHORT: + short vShort = ((ShortObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testLong(vShort); + case INT: + int vInt = ((IntObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testLong(vInt); + case LONG: + long vLong = ((LongObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testLong(vLong); + case FLOAT: + float vFloat = ((FloatObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testDouble(vFloat); + case DOUBLE: + double vDouble = ((DoubleObjectInspector) valObjectInspector). + get(arguments[0].get()); + return bloomFilter.testDouble(vDouble); + case DECIMAL: + HiveDecimal vDecimal = ((HiveDecimalObjectInspector) valObjectInspector). + getPrimitiveJavaObject(arguments[0].get()); + return bloomFilter.testString(vDecimal.toString()); + case DATE: + DateWritable vDate = ((DateObjectInspector) valObjectInspector). + getPrimitiveWritableObject(arguments[0].get()); + return bloomFilter.testLong(vDate.getDays()); + case TIMESTAMP: + Timestamp vTimeStamp = ((TimestampObjectInspector) valObjectInspector). + getPrimitiveJavaObject(arguments[0].get()); + return bloomFilter.testLong(vTimeStamp.getTime()); + case CHAR: + Text vChar = ((HiveCharObjectInspector) valObjectInspector). + getPrimitiveWritableObject(arguments[0].get()).getStrippedValue(); + return bloomFilter.testBytes(vChar.getBytes(), 0, vChar.getLength()); + case VARCHAR: + Text vVarchar = ((HiveVarcharObjectInspector) valObjectInspector). + getPrimitiveWritableObject(arguments[0].get()).getTextValue(); + return bloomFilter.testBytes(vVarchar.getBytes(), 0, vVarchar.getLength()); + case STRING: + Text vString = ((StringObjectInspector) valObjectInspector). + getPrimitiveWritableObject(arguments[0].get()); + return bloomFilter.testBytes(vString.getBytes(), 0, vString.getLength()); + case BINARY: + BytesWritable vBytes = ((BinaryObjectInspector) valObjectInspector). 
+ getPrimitiveWritableObject(arguments[0].get()); + return bloomFilter.testBytes(vBytes.getBytes(), 0, vBytes.getLength()); + default: + throw new UDFArgumentTypeException(0, "Bad primitive category " + + ((PrimitiveTypeInfo) valObjectInspector).getPrimitiveCategory()); + } + } +} diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestConvertAstToSearchArg.java b/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestConvertAstToSearchArg.java index 93b50a6..6563290 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestConvertAstToSearchArg.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestConvertAstToSearchArg.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.Set; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.exec.SerializationUtilities; import org.apache.hadoop.hive.ql.io.parquet.read.ParquetFilterPredicateConverter; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue; @@ -44,6 +45,8 @@ */ public class TestConvertAstToSearchArg { + private final Configuration conf = new Configuration(); + private static void assertNoSharedNodes(ExpressionTree tree, Set seen ) throws Exception { @@ -547,7 +550,7 @@ public void testExpression1() throws Exception { " \n"; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(9, leaves.size()); @@ -836,7 +839,7 @@ public void testExpression2() throws Exception { " \n"; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(4, leaves.size()); @@ -1269,7 +1272,7 @@ public void testExpression3() throws Exception { " \n"; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(3, leaves.size()); @@ -1493,7 +1496,7 @@ id in (34,50) */ "\n"; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(3, leaves.size()); @@ -1763,7 +1766,7 @@ public void testExpression5() throws Exception { " \n"; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(1, leaves.size()); @@ -2246,7 +2249,7 @@ public void testExpression7() throws Exception { ""; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(9, leaves.size()); @@ -2405,7 +2408,7 @@ public void testExpression8() throws Exception { " "; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(0, leaves.size()); @@ -2538,7 +2541,7 @@ public void testExpression9() throws Exception { " "; SearchArgumentImpl sarg = - (SearchArgumentImpl) 
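The in_bloom_filter UDF above takes the probe-side value as its first argument and the serialized filter as a binary second argument, deserializes the filter once on the first non-null row, and then answers per-row membership tests. Below is a rough standalone driver for it, assuming a string key; it uses only constructors and object inspectors that appear in this patch or in stock Hive, while the class name InBloomFilterDemo and the key values are invented for illustration.

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFInBloomFilter;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hive.common.util.BloomFilter;

public class InBloomFilterDemo {
  public static void main(String[] args) throws Exception {
    // Build a small filter over the "build side" keys and serialize it,
    // as the bloom_filter aggregate would.
    BloomFilter bf = new BloomFilter(100);
    byte[] buildKey = "238".getBytes(StandardCharsets.UTF_8);
    bf.addBytes(buildKey, 0, buildKey.length);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    BloomFilter.serialize(out, bf);
    BytesWritable filterBytes = new BytesWritable(out.toByteArray());

    // First argument: the probed string column; second: the binary filter.
    GenericUDFInBloomFilter udf = new GenericUDFInBloomFilter();
    udf.initialize(new ObjectInspector[] {
        PrimitiveObjectInspectorFactory.javaStringObjectInspector,
        PrimitiveObjectInspectorFactory.writableBinaryObjectInspector });

    // Probe one inserted key and one that was not inserted. The second call
    // reuses the filter deserialized on the first call.
    Object hit = udf.evaluate(new DeferredObject[] {
        new DeferredJavaObject("238"), new DeferredJavaObject(filterBytes) });
    Object miss = udf.evaluate(new DeferredObject[] {
        new DeferredJavaObject("999"), new DeferredJavaObject(filterBytes) });
    System.out.println(hit + " " + miss); // true, usually false (false positives possible)
  }
}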
ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(0, leaves.size()); @@ -2663,7 +2666,7 @@ public void testExpression10() throws Exception { ""; SearchArgumentImpl sarg = - (SearchArgumentImpl) ConvertAstToSearchArg.create(getFuncDesc(exprStr)); + (SearchArgumentImpl) ConvertAstToSearchArg.create(conf, getFuncDesc(exprStr)); List leaves = sarg.getLeaves(); assertEquals(1, leaves.size()); @@ -2712,7 +2715,7 @@ public void TestTimestampSarg() throws Exception { "AAABgj0BRVFVQcwBBW9yZy5hcGFjaGUuaGFkb29wLmlvLkJvb2xlYW5Xcml0YWJs5Q" + "EAAAECAQFib29sZWHu"; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2731,7 +2734,7 @@ public void TestDateSarg() throws Exception { "Y2hlLmhhZG9vcC5oaXZlLnFsLnVkZi5nZW5lcmljLkdlbmVyaWNVREZPUEVxdWHsAQAAAYI9AUVRVUH" + "MAQVvcmcuYXBhY2hlLmhhZG9vcC5pby5Cb29sZWFuV3JpdGFibOUBAAABAgEBYm9vbGVh7g=="; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2751,7 +2754,7 @@ public void TestDecimalSarg() throws Exception { "oaXZlLnFsLnVkZi5nZW5lcmljLkdlbmVyaWNVREZPUEVxdWHsAQAAAYI9AUVRVUHMAQZvcmcuYXBhY2" + "hlLmhhZG9vcC5pby5Cb29sZWFuV3JpdGFibOUBAAABBAEBYm9vbGVh7g=="; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2771,7 +2774,7 @@ public void TestCharSarg() throws Exception { "vb3AuaGl2ZS5xbC51ZGYuZ2VuZXJpYy5HZW5lcmljVURGT1BFcXVh7AEAAAGCPQFFUVVBzAEGb3JnLm" + "FwYWNoZS5oYWRvb3AuaW8uQm9vbGVhbldyaXRhYmzlAQAAAQQBAWJvb2xlYe4="; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2791,7 +2794,7 @@ public void TestVarcharSarg() throws Exception { "lLmhhZG9vcC5oaXZlLnFsLnVkZi5nZW5lcmljLkdlbmVyaWNVREZPUEVxdWHsAQAAAYI9AUVRVUHMAQ" + "ZvcmcuYXBhY2hlLmhhZG9vcC5pby5Cb29sZWFuV3JpdGFibOUBAAABBAEBYm9vbGVh7g=="; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2810,7 +2813,7 @@ public void TestBigintSarg() throws Exception { "dmUucWwudWRmLmdlbmVyaWMuR2VuZXJpY1VERk9QRXF1YewBAAABgj0BRVFVQcwBBW9yZy5hcGFjaGU" + "uaGFkb29wLmlvLkJvb2xlYW5Xcml0YWJs5QEAAAECAQFib29sZWHu"; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, 
SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2831,7 +2834,7 @@ public void TestBooleanSarg() throws Exception { "hlLmhhZG9vcC5pby5Cb29sZWFuV3JpdGFibOUBAAABAwkBAgEBYrIAAAgBAwkBB29yZy5hcGFjaGUua" + "GFkb29wLmhpdmUucWwudWRmLmdlbmVyaWMuR2VuZXJpY1VERk9QQW7kAQEGAQAAAQMJ"; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("(and leaf-0 leaf-1)", sarg.getExpression().toString()); assertEquals(2, sarg.getLeaves().size()); @@ -2853,7 +2856,7 @@ public void TestFloatSarg() throws Exception { "aXZlLnFsLnVkZi5nZW5lcmljLkdlbmVyaWNVREZPUEVxdWHsAQAAAYI9AUVRVUHMAQVvcmcuYXBhY2h" + "lLmhhZG9vcC5pby5Cb29sZWFuV3JpdGFibOUBAAABAgEBYm9vbGVh7g=="; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); @@ -2872,7 +2875,7 @@ public void TestDoubleSarg() throws Exception { "b29wLmhpdmUucWwudWRmLmdlbmVyaWMuR2VuZXJpY1VERk9QRXF1YewBAAABgj0BRVFVQcwBBW9yZy5" + "hcGFjaGUuaGFkb29wLmlvLkJvb2xlYW5Xcml0YWJs5QEAAAECAQFib29sZWHu"; SearchArgument sarg = - new ConvertAstToSearchArg(SerializationUtilities.deserializeExpression(serialAst)) + new ConvertAstToSearchArg(conf, SerializationUtilities.deserializeExpression(serialAst)) .buildSearchArgument(); assertEquals("leaf-0", sarg.getExpression().toString()); assertEquals(1, sarg.getLeaves().size()); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java b/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java index 8cbc26d..df42058 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/sarg/TestSearchArgumentImpl.java @@ -79,7 +79,7 @@ public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator, Object literal, List literalList) { return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName, - literal, literalList); + literal, literalList, null); } @Test diff --git a/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction.q b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction.q new file mode 100644 index 0000000..10e7bf4 --- /dev/null +++ b/ql/src/test/queries/clientpositive/dynamic_semijoin_reduction.q @@ -0,0 +1,68 @@ +set hive.compute.query.using.stats=false; +set hive.mapred.mode=nonstrict; +set hive.explain.user=false; +set hive.optimize.ppd=true; +set hive.ppd.remove.duplicatefilters=true; +set hive.tez.dynamic.partition.pruning=true; +set hive.tez.dynamic.semijoin.reduction=true; +set hive.optimize.metadataonly=false; +set hive.optimize.index.filter=true; + +-- Create Tables +create table alltypesorc_int ( cint int, cstring string ) stored as ORC; +create table srcpart_date (key string, value string) partitioned by (ds string ) stored as ORC; +CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC; + +-- Add Partitions +alter table srcpart_date add partition (ds = "2008-04-08"); +alter table srcpart_date add partition (ds = "2008-04-09"); + +alter table srcpart_small add partition 
(ds = "2008-04-08"); +alter table srcpart_small add partition (ds = "2008-04-09"); + +-- Load +insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc; +insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08"; +insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"; +insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09"; +set hive.tez.dynamic.semijoin.reduction=false; + +-- single column, single key +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1); +set hive.tez.dynamic.semijoin.reduction=false; +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1); +set hive.tez.dynamic.semijoin.reduction=true; + +-- Mix dynamic partition pruning(DPP) and min/max bloom filter optimizations. Should pick the DPP. +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds); +set hive.tez.dynamic.semijoin.reduction=false; + +--multiple sources, single key +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring); +set hive.tez.dynamic.semijoin.reduction=true; +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring); +set hive.tez.dynamic.semijoin.reduction=false; + +-- single source, multiple keys +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1); +set hive.tez.dynamic.semijoin.reduction=true; +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1); +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1); +set hive.tez.dynamic.semijoin.reduction=false; + +-- multiple sources, different keys +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring); +set hive.tez.dynamic.semijoin.reduction=true; +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = 
srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring); +select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring); + +drop table srcpart_date; +drop table srcpart_small; +drop table alltypesorc_int; diff --git a/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out new file mode 100644 index 0000000..504bbca --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/dynamic_semijoin_reduction.q.out @@ -0,0 +1,1445 @@ +PREHOOK: query: -- Create Tables +create table alltypesorc_int ( cint int, cstring string ) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@alltypesorc_int +POSTHOOK: query: -- Create Tables +create table alltypesorc_int ( cint int, cstring string ) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@alltypesorc_int +PREHOOK: query: create table srcpart_date (key string, value string) partitioned by (ds string ) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@srcpart_date +POSTHOOK: query: create table srcpart_date (key string, value string) partitioned by (ds string ) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@srcpart_date +PREHOOK: query: CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@srcpart_small +POSTHOOK: query: CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds string) STORED as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@srcpart_small +PREHOOK: query: -- Add Partitions +alter table srcpart_date add partition (ds = "2008-04-08") +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@srcpart_date +POSTHOOK: query: -- Add Partitions +alter table srcpart_date add partition (ds = "2008-04-08") +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@srcpart_date +POSTHOOK: Output: default@srcpart_date@ds=2008-04-08 +PREHOOK: query: alter table srcpart_date add partition (ds = "2008-04-09") +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@srcpart_date +POSTHOOK: query: alter table srcpart_date add partition (ds = "2008-04-09") +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@srcpart_date +POSTHOOK: Output: default@srcpart_date@ds=2008-04-09 +PREHOOK: query: alter table srcpart_small add partition (ds = "2008-04-08") +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@srcpart_small +POSTHOOK: query: alter table srcpart_small add partition (ds = "2008-04-08") +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@srcpart_small +POSTHOOK: Output: default@srcpart_small@ds=2008-04-08 +PREHOOK: query: alter table srcpart_small add partition (ds = "2008-04-09") +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@srcpart_small +POSTHOOK: query: alter table srcpart_small add partition (ds = "2008-04-09") +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@srcpart_small +POSTHOOK: Output: default@srcpart_small@ds=2008-04-09 +PREHOOK: query: -- Load +insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc 
+PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@alltypesorc_int +POSTHOOK: query: -- Load +insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@alltypesorc_int +POSTHOOK: Lineage: alltypesorc_int.cint SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: alltypesorc_int.cstring SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ] +PREHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08" +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +PREHOOK: Output: default@srcpart_date@ds=2008-04-08 +POSTHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, value from srcpart where ds = "2008-04-08" +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +POSTHOOK: Output: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-08).key SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-08).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09" +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +PREHOOK: Output: default@srcpart_date@ds=2008-04-09 +POSTHOOK: query: insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09" +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +POSTHOOK: Output: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-09).key SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-09).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09" +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +PREHOOK: Output: default@srcpart_small@ds=2008-04-09 +POSTHOOK: query: insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, value from srcpart where ds = "2008-04-09" +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +POSTHOOK: Output: default@srcpart_small@ds=2008-04-09 +POSTHOOK: Lineage: srcpart_small PARTITION(ds=2008-04-09).key1 SIMPLE [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_small PARTITION(ds=2008-04-09).value1 SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, 
comment:default), ] +PREHOOK: query: -- single column, single key +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +PREHOOK: type: QUERY +POSTHOOK: query: -- single column, single key +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join 
srcpart_small on (srcpart_date.key = srcpart_small.key1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +8224 +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 
Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +8224 +PREHOOK: query: -- Mix dynamic partition pruning(DPP) and min/max bloom filter optimizations. Should pick the DPP. +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds) +PREHOOK: type: QUERY +POSTHOOK: query: -- Mix dynamic partition pruning(DPP) and min/max bloom filter optimizations. Should pick the DPP. 
+EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Dynamic Partitioning Event Operator + Target column: ds (string) + Target Input: srcpart_small + Partition key expr: ds + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Target Vertex: Map 4 + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: ds is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 360000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: ds (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.ds) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +0 +PREHOOK: query: --multiple sources, single key +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +PREHOOK: type: QUERY +POSTHOOK: query: --multiple sources, single key +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 5 + Map Operator Tree: + TableScan + alias: alltypesorc_int + filterExpr: cstring is not null 
(type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: cstring is not null (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 1 to 2 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + 2 _col0 (type: string) + Statistics: Num rows: 27033 Data size: 2038454 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc_int +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc_int +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +48 +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) 
+POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 5 <- Reducer 4 (BROADCAST_EDGE) + Map 7 <- Reducer 6 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE), Map 7 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Map 1 (SIMPLE_EDGE) + Reducer 6 <- Map 5 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: key is not null (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntires=500) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Map 5 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: (key1 is not null and key1 BETWEEN DynamicValue(RS_9_srcpart_date_key1_min) AND DynamicValue(RS_9_srcpart_date_key1_max) and in_bloom_filter(key1, DynamicValue(RS_9_srcpart_date_key1_bloom_filter))) (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key1 is not null and key1 BETWEEN DynamicValue(RS_9_srcpart_date_key1_min) AND DynamicValue(RS_9_srcpart_date_key1_max) and in_bloom_filter(key1, DynamicValue(RS_9_srcpart_date_key1_bloom_filter))) (type: boolean) + Statistics: Num rows: 250 Data size: 46000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 250 Data size: 46000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 250 Data size: 46000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 250 Data size: 46000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntires=250) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: 
string), _col1 (type: string), _col2 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Map 7 + Map Operator Tree: + TableScan + alias: alltypesorc_int + filterExpr: (cstring is not null and cstring BETWEEN DynamicValue(RS_10_srcpart_small_cstring_min) AND DynamicValue(RS_10_srcpart_small_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_10_srcpart_small_cstring_bloom_filter))) (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (cstring is not null and cstring BETWEEN DynamicValue(RS_10_srcpart_small_cstring_min) AND DynamicValue(RS_10_srcpart_small_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_10_srcpart_small_cstring_bloom_filter))) (type: boolean) + Statistics: Num rows: 3072 Data size: 231642 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 3072 Data size: 231642 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3072 Data size: 231642 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 1 to 2 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + 2 _col0 (type: string) + Statistics: Num rows: 6758 Data size: 509612 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntires=500) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Reducer 6 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntires=250) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + 
+ Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc_int +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_small.key1 = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc_int +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +48 +PREHOOK: query: -- single source, multiple keys +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +PREHOOK: type: QUERY +POSTHOOK: query: -- single source, multiple keys +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: (key1 is not null and value1 is not null) (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key1 is not null and value1 is not null) (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string), value1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: 
string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string), _col1 (type: string) + 1 _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +8224 +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A 
masked pattern was here #### +8224 +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1 and srcpart_date.value = srcpart_small.value1) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 5 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntires=500) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Map 5 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: (key1 is not null and value1 is not null and key1 BETWEEN DynamicValue(RS_6_srcpart_date_key1_min) AND DynamicValue(RS_6_srcpart_date_key1_max) and in_bloom_filter(key1, DynamicValue(RS_6_srcpart_date_key1_bloom_filter))) (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key1 is not null and value1 is not null and key1 BETWEEN DynamicValue(RS_6_srcpart_date_key1_min) AND DynamicValue(RS_6_srcpart_date_key1_max) and in_bloom_filter(key1, DynamicValue(RS_6_srcpart_date_key1_bloom_filter))) (type: boolean) + Statistics: Num rows: 250 Data size: 46000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string), value1 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 46000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 250 Data size: 46000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + 
LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string), _col1 (type: string) + 1 _col0 (type: string), _col1 (type: string) + Statistics: Num rows: 550 Data size: 101200 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntires=500) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- multiple sources, different keys +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +PREHOOK: type: QUERY +POSTHOOK: query: -- multiple sources, different keys +EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE) + Reducer 3 <- Map 6 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Reducer 3 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: 
string) + Execution mode: llap + LLAP IO: all inputs + Map 5 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Map 6 + Map Operator Tree: + TableScan + alias: alltypesorc_int + filterExpr: cstring is not null (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: cstring is not null (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: string) + sort order: + + Map-reduce partition columns: _col1 (type: string) + Statistics: Num rows: 2200 Data size: 404800 Basic stats: COMPLETE Column stats: NONE + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: string) + 1 _col0 (type: string) + Statistics: Num rows: 13516 Data size: 1019227 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value 
= alltypesorc_int.cstring) +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc_int +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc_int +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +0 +PREHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 7 <- Reducer 5 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE) + Reducer 3 <- Map 7 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Reducer 3 (SIMPLE_EDGE) + Reducer 5 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_date + filterExpr: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 2000 Data size: 368000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (key is not null and value is not null) (type: boolean) + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Execution mode: llap + LLAP IO: all inputs + Map 6 + Map Operator Tree: + TableScan + alias: srcpart_small + filterExpr: key1 is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 184000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key1 is not null (type: boolean) + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 92000 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + 
Map 7 + Map Operator Tree: + TableScan + alias: alltypesorc_int + filterExpr: (cstring is not null and cstring BETWEEN DynamicValue(RS_12_srcpart_date_cstring_min) AND DynamicValue(RS_12_srcpart_date_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_12_srcpart_date_cstring_bloom_filter))) (type: boolean) + Statistics: Num rows: 12288 Data size: 926570 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (cstring is not null and cstring BETWEEN DynamicValue(RS_12_srcpart_date_cstring_min) AND DynamicValue(RS_12_srcpart_date_cstring_max) and in_bloom_filter(cstring, DynamicValue(RS_12_srcpart_date_cstring_bloom_filter))) (type: boolean) + Statistics: Num rows: 6144 Data size: 463285 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: cstring (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 6144 Data size: 463285 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 6144 Data size: 463285 Basic stats: COMPLETE Column stats: NONE + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 550 Data size: 101200 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col1 (type: string) + sort order: + + Map-reduce partition columns: _col1 (type: string) + Statistics: Num rows: 550 Data size: 101200 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col1 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 550 Data size: 101200 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntires=550) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: string) + 1 _col0 (type: string) + Statistics: Num rows: 6758 Data size: 509613 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By 
Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntires=550) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc_int +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_date@ds=2008-04-08 +PREHOOK: Input: default@srcpart_date@ds=2008-04-09 +PREHOOK: Input: default@srcpart_small +PREHOOK: Input: default@srcpart_small@ds=2008-04-08 +PREHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_date join srcpart_small on (srcpart_date.key = srcpart_small.key1) join alltypesorc_int on (srcpart_date.value = alltypesorc_int.cstring) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc_int +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_date@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_date@ds=2008-04-09 +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Input: default@srcpart_small@ds=2008-04-08 +POSTHOOK: Input: default@srcpart_small@ds=2008-04-09 +#### A masked pattern was here #### +0 +PREHOOK: query: drop table srcpart_date +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@srcpart_date +PREHOOK: Output: default@srcpart_date +POSTHOOK: query: drop table srcpart_date +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Output: default@srcpart_date +PREHOOK: query: drop table srcpart_small +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@srcpart_small +PREHOOK: Output: default@srcpart_small +POSTHOOK: query: drop table srcpart_small +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@srcpart_small +POSTHOOK: Output: default@srcpart_small +PREHOOK: query: drop table alltypesorc_int +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@alltypesorc_int +PREHOOK: Output: default@alltypesorc_int +POSTHOOK: query: drop table alltypesorc_int +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@alltypesorc_int +POSTHOOK: Output: default@alltypesorc_int diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/LiteralDelegate.java b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/LiteralDelegate.java new file mode 100644 index 0000000..bd8a5ce --- /dev/null +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/LiteralDelegate.java @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.sarg; + +import org.apache.hadoop.conf.Configurable; + +/** + * Interface to retrieve a literal value + */ +public interface LiteralDelegate extends Configurable { + + Object getLiteral(); + + String getId(); +} diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java index 8fda95c..3c10c83 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentFactory.java @@ -18,13 +18,20 @@ package org.apache.hadoop.hive.ql.io.sarg; +import org.apache.hadoop.conf.Configuration; + /** * A factory for creating SearchArguments, as well as modifying those created by this factory. */ public class SearchArgumentFactory { public static SearchArgument.Builder newBuilder() { - return new SearchArgumentImpl.BuilderImpl(); + return newBuilder(null); + } + + public static SearchArgument.Builder newBuilder(Configuration conf) { + return new SearchArgumentImpl.BuilderImpl(conf); } + public static void setPredicateLeafColumn(PredicateLeaf leaf, String newName) { SearchArgumentImpl.PredicateLeafImpl.setColumnName(leaf, newName); } diff --git a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java index 10d8c51..db0a582 100644 --- a/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java +++ b/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java @@ -31,6 +31,8 @@ import java.util.Queue; import java.util.Set; +import org.apache.hadoop.conf.Configuration; + /** * The implementation of SearchArguments. Visible for testing only. */ @@ -57,27 +59,17 @@ public PredicateLeafImpl(Operator operator, Type type, String columnName, Object literal, - List literalList) { + List literalList, Configuration conf) { this.operator = operator; this.type = type; this.columnName = columnName; this.literal = literal; - if (literal != null) { - if (literal.getClass() != type.getValueClass()) { - throw new IllegalArgumentException("Wrong value class " + - literal.getClass().getName() + " for " + type + "." + operator + - " leaf"); - } - } + checkLiteralType(literal, type, conf); this.literalList = literalList; if (literalList != null) { Class valueCls = type.getValueClass(); for(Object lit: literalList) { - if (lit != null && lit.getClass() != valueCls) { - throw new IllegalArgumentException("Wrong value class item " + - lit.getClass().getName() + " for " + type + "." 
+ operator + - " leaf"); - } + checkLiteralType(lit, type, conf); } } } @@ -99,6 +91,10 @@ public String getColumnName() { @Override public Object getLiteral() { + if (literal instanceof LiteralDelegate) { + return ((LiteralDelegate) literal).getLiteral(); + } + // To get around a kryo 2.22 bug while deserialize a Timestamp into Date // (https://github.com/EsotericSoftware/kryo/issues/88) // When we see a Date, convert back into Timestamp @@ -110,6 +106,13 @@ public Object getLiteral() { @Override public List getLiteralList() { + if (literalList != null && literalList.size() > 0 && literalList.get(0) instanceof LiteralDelegate) { + List newLiteraList = new ArrayList(); + for (Object litertalObj : literalList) { + newLiteraList.add(((LiteralDelegate) litertalObj).getLiteral()); + } + return newLiteraList; + } return literalList; } @@ -169,6 +172,23 @@ public static void setColumnName(PredicateLeaf leaf, String newName) { assert leaf instanceof PredicateLeafImpl; ((PredicateLeafImpl)leaf).columnName = newName; } + + protected void checkLiteralType(Object literal, Type type, Configuration conf) { + if (literal == null) { + return; + } + + if (literal instanceof LiteralDelegate) { + // Give it a pass. Optionally, have LiteralDelegate provide a getLiteralClass() to check. + ((LiteralDelegate) literal).setConf(conf); + } else { + if (literal.getClass() != type.getValueClass()) { + throw new IllegalArgumentException("Wrong value class " + + literal.getClass().getName() + " for " + type + "." + operator + + " leaf"); + } + } + } } private final List leaves; @@ -218,6 +238,11 @@ public String toString() { static class BuilderImpl implements Builder { + Configuration conf; + public BuilderImpl(Configuration conf) { + this.conf = conf; + } + // max threshold for CNF conversion. 
having >8 elements in andList will be // converted to maybe private static final int CNF_COMBINATIONS_THRESHOLD = 256; @@ -291,7 +316,7 @@ public Builder lessThan(String column, PredicateLeaf.Type type, } else { PredicateLeaf leaf = new PredicateLeafImpl(PredicateLeaf.Operator.LESS_THAN, - type, column, literal, null); + type, column, literal, null, conf); parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); } return this; @@ -306,7 +331,7 @@ public Builder lessThanEquals(String column, PredicateLeaf.Type type, } else { PredicateLeaf leaf = new PredicateLeafImpl(PredicateLeaf.Operator.LESS_THAN_EQUALS, - type, column, literal, null); + type, column, literal, null, conf); parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); } return this; @@ -321,7 +346,7 @@ public Builder equals(String column, PredicateLeaf.Type type, } else { PredicateLeaf leaf = new PredicateLeafImpl(PredicateLeaf.Operator.EQUALS, - type, column, literal, null); + type, column, literal, null, conf); parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); } return this; @@ -336,7 +361,7 @@ public Builder nullSafeEquals(String column, PredicateLeaf.Type type, } else { PredicateLeaf leaf = new PredicateLeafImpl(PredicateLeaf.Operator.NULL_SAFE_EQUALS, - type, column, literal, null); + type, column, literal, null, conf); parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); } return this; @@ -358,7 +383,7 @@ public Builder in(String column, PredicateLeaf.Type type, PredicateLeaf leaf = new PredicateLeafImpl(PredicateLeaf.Operator.IN, - type, column, null, argList); + type, column, null, argList, conf); parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); } return this; @@ -372,7 +397,7 @@ public Builder isNull(String column, PredicateLeaf.Type type) { } else { PredicateLeaf leaf = new PredicateLeafImpl(PredicateLeaf.Operator.IS_NULL, - type, column, null, null); + type, column, null, null, conf); parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); } return this; @@ -390,7 +415,7 @@ public Builder between(String column, PredicateLeaf.Type type, Object lower, argList.add(upper); PredicateLeaf leaf = new PredicateLeafImpl(PredicateLeaf.Operator.BETWEEN, - type, column, null, argList); + type, column, null, argList, conf); parent.getChildren().add(new ExpressionTree(addLeaf(leaf))); } return this; diff --git a/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java b/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java index e60690d..d44bba8 100644 --- a/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java +++ b/storage-api/src/java/org/apache/hive/common/util/BloomFilter.java @@ -18,6 +18,8 @@ package org.apache.hive.common.util; +import java.io.*; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -242,6 +244,55 @@ public void reset() { } /** + * Serialize a bloom filter + * @param out output stream to write to + * @param bloomFilter BloomFilter that needs to be serialized + */ + public static void serialize(OutputStream out, BloomFilter bloomFilter) throws IOException { + /** + * Serialized BloomFilter format: + * 1 byte for the number of hash functions. 
+ * 1 big endian int for the number of bits in the filter (DataOutputStream writes big endian) + * big endian longs for the BloomFilter bitset + */ + DataOutputStream dataOutputStream = new DataOutputStream(out); + dataOutputStream.writeByte(bloomFilter.numHashFunctions); + dataOutputStream.writeInt(bloomFilter.numBits); + for (long value : bloomFilter.getBitSet()) { + dataOutputStream.writeLong(value); + } + } + + /** + * Deserialize a bloom filter + * Reads a byte stream, which was written by {@linkplain #serialize(OutputStream, BloomFilter)}, + * into a {@code BloomFilter} + * @param in input byte stream + * @return deserialized BloomFilter + */ + public static BloomFilter deserialize(InputStream in) throws IOException { + if (in == null) { + throw new IOException("Input stream is null"); + } + + try { + DataInputStream dataInputStream = new DataInputStream(in); + int numHashFunc = dataInputStream.readByte(); + int numBits = dataInputStream.readInt(); + int sz = (numBits/Long.SIZE); + List data = new ArrayList(); + for (int i = 0; i < sz; i++) { + data.add(dataInputStream.readLong()); + } + return new BloomFilter(data, numBits, numHashFunc); + } catch (RuntimeException e) { + IOException io = new IOException("Unable to deserialize BloomFilter"); + io.initCause(e); + throw io; + } + } + + /** + * Bare metal bit set implementation. For performance reasons, this implementation does not check + * for index bounds nor expand the bit set size if the specified index is greater than the size. + */
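
A minimal sketch of how a caller might exercise the conf-aware SearchArgument builder together with the LiteralDelegate hook added above. The ConstantDelegate class, the SargSketch wrapper, and the column/value names are hypothetical illustrations; only SearchArgumentFactory.newBuilder(Configuration), the existing SearchArgument.Builder leaf methods, and the LiteralDelegate/Configurable contract are taken from this patch.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.sarg.LiteralDelegate;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

/** Hypothetical delegate that resolves its literal lazily; a real implementation would
 *  look the value up at evaluation time (e.g. a runtime-supplied min/max) rather than
 *  hold a constant. */
class ConstantDelegate implements LiteralDelegate {
  private Configuration conf;
  private final String id;
  private final Object value;

  ConstantDelegate(String id, Object value) {
    this.id = id;
    this.value = value;
  }

  @Override
  public Object getLiteral() {
    return value;      // unwrapped by PredicateLeafImpl.getLiteral()
  }

  @Override
  public String getId() {
    return id;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;  // handed in by checkLiteralType()
  }

  @Override
  public Configuration getConf() {
    return conf;
  }
}

class SargSketch {
  static SearchArgument build(Configuration conf) {
    // checkLiteralType() skips the value-class check for LiteralDelegate literals,
    // so the delegate can stand in for a value that is not known at plan time.
    return SearchArgumentFactory.newBuilder(conf)
        .startAnd()
        .equals("key", PredicateLeaf.Type.STRING, new ConstantDelegate("key_val", "2008-04-08"))
        .end()
        .build();
  }
}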
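
The serialize/deserialize pair added to BloomFilter writes the hash-function count, the bit count, and the raw bitset, so a filter built on one vertex can be shipped and probed elsewhere. A small round-trip sketch follows, assuming the pre-existing BloomFilter(long expectedEntries) constructor and the addLong/testLong helpers; only serialize() and deserialize() come from the methods above.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.hive.common.util.BloomFilter;

class BloomFilterRoundTrip {
  public static void main(String[] args) throws IOException {
    BloomFilter bf = new BloomFilter(1000);   // assumed expectedEntries constructor
    for (long i = 0; i < 1000; i++) {
      bf.addLong(i);
    }

    // Wire format per the javadoc above: numHashFunctions byte, big endian bit count,
    // then the bitset as big endian longs.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    BloomFilter.serialize(out, bf);

    BloomFilter copy = BloomFilter.deserialize(new ByteArrayInputStream(out.toByteArray()));
    System.out.println(copy.testLong(42L));     // true
    System.out.println(copy.testLong(424242L)); // false, barring a false positive
  }
}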