diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index a120b45..1714a8a 100644
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2494,6 +2494,9 @@
         "higher compute cost. (NDV means the number of distinct values.). It only affects the FM-Sketch \n" +
         "(not the HLL algorithm which is the default), where it computes the number of necessary\n" +
         " bitvectors to achieve the accuracy."),
+    HIVE_STATS_USE_UDF_ESTIMATORS("hive.stats.use.statestimators", true,
+        "Stat estimators can provide more accurate column statistics for UDF results."),
+
     /**
      * @deprecated Use MetastoreConf.STATS_NDV_TUNER
      */
diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index cb2d0a7..2673580 100644
--- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -27,6 +27,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.Optional;
 import java.util.Set;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
@@ -73,6 +74,9 @@
 import org.apache.hadoop.hive.ql.plan.Statistics;
 import org.apache.hadoop.hive.ql.plan.Statistics.State;
 import org.apache.hadoop.hive.ql.stats.BasicStats.Factory;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimator;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimatorProvider;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
 import org.apache.hadoop.hive.ql.udf.generic.NDV;
@@ -81,6 +85,7 @@
 import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
 import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantListObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantMapObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantStructObjectInspector;
@@ -1528,18 +1533,7 @@
         return null;
       }
     } else if (end instanceof ExprNodeConstantDesc) {
-
-      // constant projection
-      ExprNodeConstantDesc encd = (ExprNodeConstantDesc) end;
-
-      colName = encd.getName();
-      colType = encd.getTypeString();
-      if (encd.getValue() == null) {
-        // null projection
-        numNulls = numRows;
-      } else {
-        countDistincts = 1;
-      }
+      return buildColStatForConstant(conf, numRows, (ExprNodeConstantDesc) end);
     } else if (end instanceof ExprNodeGenericFuncDesc) {
       ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end;
       colName = engfd.getName();
@@ -1560,6 +1554,27 @@
         }
       }
 
+      if (conf.getBoolVar(ConfVars.HIVE_STATS_USE_UDF_ESTIMATORS)) {
+        Optional<IStatEstimatorProvider> sep = engfd.getGenericUDF().adapt(IStatEstimatorProvider.class);
+        if (sep.isPresent()) {
+          Optional<IStatEstimator> se = sep.get().getStatEstimator();
+          if (se.isPresent()) {
+            List<ColStatistics> csList = new ArrayList<>();
+            for (ExprNodeDesc child : engfd.getChildren()) {
+              ColStatistics cs = getColStatisticsFromExpression(conf, parentStats, child);
+              csList.add(cs);
+            }
+            Optional<ColStatistics> res = se.get().estimate(csList);
+            if (res.isPresent()) {
+              ColStatistics newStats = res.get();
+              colType = colType.toLowerCase();
+              newStats.setColumnType(colType);
+              newStats.setColumnName(colName);
+              return newStats;
+            }
+          }
+        }
+      }
+      // fallback to default
       countDistincts = getNDVFor(engfd, numRows, parentStats);
     } else if (end instanceof ExprNodeColumnListDesc) {
@@ -1590,6 +1605,43 @@
     return colStats;
   }
 
+  private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows, ExprNodeConstantDesc encd) {
+    long numNulls = 0;
+    long countDistincts = 0;
+    if (encd.getValue() == null) {
+      // null projection
+      numNulls = numRows;
+    } else {
+      countDistincts = 1;
+    }
+    String colType = encd.getTypeString().toLowerCase();
+    ObjectInspector oi = encd.getWritableObjectInspector();
+    double avgColSize = getAvgColLenOf(conf, oi, colType);
+    ColStatistics colStats = new ColStatistics(encd.getName(), colType);
+    colStats.setAvgColLen(avgColSize);
+    colStats.setCountDistint(countDistincts);
+    colStats.setNumNulls(numNulls);
+
+    Optional<Long> value = getLongConstValue(encd);
+    if (value.isPresent()) {
+      colStats.setRange(value.get(), value.get());
+    }
+    return colStats;
+  }
+
+  private static Optional<Long> getLongConstValue(ExprNodeConstantDesc encd) {
+    if (encd.getValue() != null) {
+      String constant = encd.getValue().toString();
+      PrimitiveCategory category = GenericUDAFSum.getReturnType(encd.getTypeInfo());
+      if (category == PrimitiveCategory.LONG) {
+        return Optional.of(Long.parseLong(constant));
+      }
+    }
+    return Optional.empty();
+  }
+
   private static boolean isWideningCast(ExprNodeGenericFuncDesc engfd) {
     GenericUDF udf = engfd.getGenericUDF();
     if (!FunctionRegistry.isOpCast(udf)) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/IStatEstimator.java ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/IStatEstimator.java
new file mode 100644
index 0000000..a72ecd2
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/IStatEstimator.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.stats.estimator;
+
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+
+/**
+ * Enables statistics-related computation on UDFs.
+ */
+public interface IStatEstimator {
+
+  /**
+   * Computes the output statistics of the actual UDF.
+   *
+   * @param argStats the statistics for every argument of the UDF
+   * @return the estimated column statistics, or empty if no estimation is possible
+   */
+  Optional<ColStatistics> estimate(List<ColStatistics> argStats);
+}
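As a point of reference for implementors, a minimal sketch of a UDF wiring into this contract follows. The UDF, its package, and its estimation policy are hypothetical; only the IStatEstimator/IStatEstimatorProvider interfaces above and the existing ColStatistics API are assumed.

package org.example.hive.udf; // hypothetical package

import java.util.List;
import java.util.Optional;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimator;
import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimatorProvider;

/**
 * Hypothetical UDF that masks a string without changing its length, so the
 * input column statistics are a reasonable estimate for the output as well.
 */
public class UDFMaskString extends UDF implements IStatEstimatorProvider {

  public String evaluate(String s) {
    return s == null ? null : s.replaceAll(".", "*");
  }

  @Override
  public Optional<IStatEstimator> getStatEstimator() {
    return Optional.of(new MaskStatEstimator());
  }

  private static class MaskStatEstimator implements IStatEstimator {
    @Override
    public Optional<ColStatistics> estimate(List<ColStatistics> argStats) {
      // length is preserved, so avgColLen and numNulls carry over; the input
      // NDV serves as an upper bound for the output NDV
      ColStatistics cs = argStats.get(0).clone();
      cs.setIsEstimated(true);
      return Optional.of(cs);
    }
  }
}

Returning Optional.empty() from either getStatEstimator() or estimate() makes StatsUtils fall through to the default getNDVFor heuristic, so an estimator only needs to handle the cases it understands.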
diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/IStatEstimatorProvider.java ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/IStatEstimatorProvider.java
new file mode 100644
index 0000000..be9a934
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/IStatEstimatorProvider.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.stats.estimator;
+
+import java.util.Optional;
+
+/**
+ * Marker interface with which a UDF signals that it supports stat estimators.
+ */
+public interface IStatEstimatorProvider {
+
+  /**
+   * Returns the stat estimator for the given UDF instance.
+   */
+  Optional<IStatEstimator> getStatEstimator();
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimators.java ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimators.java
new file mode 100644
index 0000000..05d894a
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimators.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.stats.estimator;
+
+import java.util.Optional;
+
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+
+public class StatEstimators {
+
+  /**
+   * Combines column statistics by keeping the worst (most pessimistic) value of every field.
+   */
+  public static class WorstStatCombiner {
+
+    private boolean inited;
+    private ColStatistics result;
+
+    public void add(ColStatistics stat) {
+      if (!inited) {
+        inited = true;
+        result = stat.clone();
+        // a combined value range is not maintained
+        result.setRange(null);
+        result.setIsEstimated(true);
+        return;
+      }
+      if (stat.getAvgColLen() > result.getAvgColLen()) {
+        result.setAvgColLen(stat.getAvgColLen());
+      }
+      if (stat.getCountDistint() > result.getCountDistint()) {
+        result.setCountDistint(stat.getCountDistint());
+      }
+      if (stat.getNumNulls() > result.getNumNulls()) {
+        result.setNumNulls(stat.getNumNulls());
+      }
+      if (stat.getNumTrues() > result.getNumTrues()) {
+        result.setNumTrues(stat.getNumTrues());
+      }
+      if (stat.getNumFalses() > result.getNumFalses()) {
+        result.setNumFalses(stat.getNumFalses());
+      }
+      if (stat.isFilteredColumn()) {
+        result.setFilterColumn();
+      }
+    }
+
+    public Optional<ColStatistics> getResult() {
+      return Optional.ofNullable(result);
+    }
+  }
+}
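For clarity, a small usage sketch of the combiner; the two input statistics and the field values in the comments are illustrative, not part of the patch.

import java.util.Optional;

import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.stats.estimator.StatEstimators.WorstStatCombiner;

class WorstCaseCombineDemo {
  // Feeds two hypothetical branch statistics into the combiner; the result
  // keeps the per-field maximum (here avgColLen = 10 and numNulls = 7),
  // is flagged as estimated, and carries no value range.
  static Optional<ColStatistics> combine(ColStatistics thenStats, ColStatistics elseStats) {
    WorstStatCombiner combiner = new WorstStatCombiner();
    combiner.add(thenStats); // e.g. avgColLen = 4,  numNulls = 7
    combiner.add(elseStats); // e.g. avgColLen = 10, numNulls = 0
    return combiner.getResult();
  }
}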
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
index 5b1964c..e20b009 100755
--- ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
@@ -19,12 +19,18 @@
 package org.apache.hadoop.hive.ql.udf;
 
 import java.util.Arrays;
+import java.util.List;
+import java.util.Optional;
 
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDF;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.StringSubstrColStart;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.StringSubstrColStartLen;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimator;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimatorProvider;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
@@ -48,7 +54,7 @@
     + "  > SELECT _FUNC_('Facebook', 5, 1) FROM src LIMIT 1;\n" + "  'b'")
 @VectorizedExpressions({StringSubstrColStart.class, StringSubstrColStartLen.class})
-public class UDFSubstr extends UDF {
+public class UDFSubstr extends UDF implements IStatEstimatorProvider {
 
   private final int[] index;
   private final Text r;
@@ -131,4 +137,52 @@
   public BytesWritable evaluate(BytesWritable bw, IntWritable pos){
     return evaluate(bw, pos, maxValue);
   }
+
+  @Override
+  public Optional<IStatEstimator> getStatEstimator() {
+    return Optional.of(new SubStrStatEstimator());
+  }
+
+  private static class SubStrStatEstimator implements IStatEstimator {
+
+    @Override
+    public Optional<ColStatistics> estimate(List<ColStatistics> csList) {
+      ColStatistics cs = csList.get(0).clone();
+
+      // this might be bad in a skewed case; consider:
+      //   1 row with a 1000 character long string
+      //   99 rows with 0 length
+      // the original avg is ~10 and the new avg becomes 5 for substr(5),
+      // but in reality it would stay around ~10
+      Range startRange = csList.get(1).getRange();
+      if (startRange != null && startRange.minValue != null) {
+        double newAvgColLen = cs.getAvgColLen() - startRange.minValue.doubleValue();
+        if (newAvgColLen > 0) {
+          cs.setAvgColLen(newAvgColLen);
+        }
+      }
+
+      // the length argument (when present) is an upper bound for the result size
+      if (csList.size() >= 3) {
+        Range lengthRange = csList.get(2).getRange();
+        if (lengthRange != null && lengthRange.maxValue != null) {
+          double w = lengthRange.maxValue.doubleValue();
+          if (cs.getAvgColLen() > w) {
+            cs.setAvgColLen(w);
+          }
+        }
+      }
+
+      return Optional.of(cs);
+    }
+  }
+}
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
index 6597f4b..c1bf325 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
@@ -20,6 +20,7 @@
 
 import java.io.Closeable;
 import java.io.IOException;
+import java.util.Optional;
 
 import org.apache.hadoop.hive.common.classification.InterfaceAudience;
 import org.apache.hadoop.hive.common.classification.InterfaceStability;
@@ -638,4 +639,12 @@
       return i + ORDINAL_SUFFIXES[i % 10];
     }
   }
+
+  @SuppressWarnings("unchecked")
+  public <T> Optional<T> adapt(Class<T> clazz) {
+    if (clazz.isInstance(this)) {
+      return Optional.of((T) this);
+    }
+    return Optional.empty();
+  }
 }
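The adapt() hook is a small generalization of an instanceof check: wrappers can override it to answer for the object they wrap, as GenericUDFBridge does below. A sketch of the intended call pattern, with a hypothetical helper class name:

import java.util.Optional;

import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimator;
import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimatorProvider;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;

class EstimatorLookupDemo {
  // Resolves an estimator for any GenericUDF: direct implementors answer via
  // the default adapt(), while GenericUDFBridge forwards to the wrapped UDF.
  static Optional<IStatEstimator> estimatorOf(GenericUDF udf) {
    return udf.adapt(IStatEstimatorProvider.class)
        .flatMap(IStatEstimatorProvider::getStatEstimator);
  }
}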
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBridge.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBridge.java
index 7a644fc..377ab37 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBridge.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBridge.java
@@ -21,6 +21,7 @@
 import java.io.Serializable;
 import java.lang.reflect.Method;
 import java.util.ArrayList;
+import java.util.Optional;
 
 import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
@@ -28,6 +29,8 @@
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimator;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimatorProvider;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils.ConversionHelper;
 import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -44,7 +47,7 @@
  * UDF class needs to be serialized with the plan.
  *
  */
-public class GenericUDFBridge extends GenericUDF implements Serializable {
+public class GenericUDFBridge extends GenericUDF implements Serializable, IStatEstimatorProvider {
   private static final long serialVersionUID = 4994861742809511113L;
 
   /**
@@ -97,7 +100,7 @@
     this.isOperator = isOperator;
     this.udfClassName = udfClassName;
   }
-  
+
   // For Java serialization only
   public GenericUDFBridge() {
   }
@@ -151,7 +154,7 @@
   public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
 
     try {
-      udf = (UDF)getUdfClassInternal().newInstance();
+      udf = getUdfClassInternal().newInstance();
     } catch (Exception e) {
       throw new UDFArgumentException(
           "Unable to instantiate UDF implementation class " + udfClassName + ": " + e);
@@ -249,4 +252,22 @@
   public interface UdfWhitelistChecker {
     boolean isUdfAllowed(Class<?> clazz);
   }
+
+  @SuppressWarnings("unchecked")
+  @Override
+  public <T> Optional<T> adapt(Class<T> clazz) {
+    if (clazz.isInstance(udf)) {
+      return Optional.of((T) udf);
+    }
+    return super.adapt(clazz);
+  }
+
+  @Override
+  public Optional<IStatEstimator> getStatEstimator() {
+    if (udf instanceof IStatEstimatorProvider) {
+      IStatEstimatorProvider sep = (IStatEstimatorProvider) udf;
+      return sep.getStatEstimator();
+    }
+    return Optional.empty();
+  }
 }
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCase.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCase.java
index 06e9d00..a5fec3b 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCase.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCase.java
@@ -18,16 +18,24 @@
 
 package org.apache.hadoop.hive.ql.udf.generic;
 
+import java.util.List;
+import java.util.Optional;
+
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimator;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimatorProvider;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimators;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimators.WorstStatCombiner;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
 
 /**
  * GenericUDF Class for SQL construct "CASE a WHEN b THEN c [ELSE f] END".
- * 
+ *
  * NOTES: 1. a and b should be compatible, or an exception will be
  * thrown. 2. c and f should be compatible types, or an exception will be
  * thrown.
@@ -49,7 +57,7 @@
     + "   END\n"
     + " FROM emp_details")
-public class GenericUDFCase extends GenericUDF {
+public class GenericUDFCase extends GenericUDF implements IStatEstimatorProvider {
   private transient ObjectInspector[] argumentOIs;
   private transient GenericUDFUtils.ReturnObjectInspectorResolver returnOIResolver;
   private transient GenericUDFUtils.ReturnObjectInspectorResolver caseOIResolver;
@@ -138,4 +146,23 @@
     return sb.toString();
   }
 
+  @Override
+  public Optional<IStatEstimator> getStatEstimator() {
+    return Optional.of(new CaseStatEstimator());
+  }
+
+  static class CaseStatEstimator implements IStatEstimator {
+
+    @Override
+    public Optional<ColStatistics> estimate(List<ColStatistics> argStats) {
+      WorstStatCombiner combiner = new StatEstimators.WorstStatCombiner();
+      // children are [a, b1, c1, b2, c2, ..., f]: combine the THEN branches
+      // (even indices from 2) and the last child (the optional ELSE)
+      for (int i = 2; i < argStats.size(); i += 2) {
+        combiner.add(argStats.get(i));
+      }
+      combiner.add(argStats.get(argStats.size() - 1));
+      return combiner.getResult();
+    }
+  }
 }
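To make the THEN/ELSE indexing above concrete, a small illustration (the helper class is hypothetical): for CASE a WHEN b1 THEN c1 WHEN b2 THEN c2 ELSE f END the expression children arrive as [a, b1, c1, b2, c2, f], so the THEN branches sit at even indices starting at 2 and the optional ELSE is last.

import java.util.ArrayList;
import java.util.List;

class CaseChildLayoutDemo {
  // Indices that CaseStatEstimator combines for a CASE with childCount
  // children: every THEN branch plus the last child (the ELSE, or the final
  // THEN when no ELSE exists; the duplicate is harmless for a max-combiner).
  static List<Integer> combinedIndices(int childCount) {
    List<Integer> idx = new ArrayList<>();
    for (int i = 2; i < childCount; i += 2) {
      idx.add(i);
    }
    idx.add(childCount - 1);
    return idx;
  }
  // combinedIndices(4) -> [2, 3]    CASE a WHEN b THEN c ELSE f END
  // combinedIndices(5) -> [2, 4, 4] two WHEN/THEN pairs, no ELSE
}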
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java
index 8ebe9e0..26b895c 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java
@@ -18,9 +18,17 @@
 
 package org.apache.hadoop.hive.ql.udf.generic;
 
+import java.util.List;
+import java.util.Optional;
+
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimator;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimatorProvider;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimators;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimators.WorstStatCombiner;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 
 /**
@@ -33,7 +41,7 @@
     value = "_FUNC_(a1, a2, ...) - Returns the first non-null argument",
     extended = "Example:\n"
     + "  > SELECT _FUNC_(NULL, 1, NULL) FROM src LIMIT 1;\n" + "  1")
-public class GenericUDFCoalesce extends GenericUDF {
+public class GenericUDFCoalesce extends GenericUDF implements IStatEstimatorProvider {
   private transient ObjectInspector[] argumentOIs;
   private transient GenericUDFUtils.ReturnObjectInspectorResolver returnOIResolver;
 
@@ -72,4 +80,20 @@
     return getStandardDisplayString("COALESCE", children, ",");
   }
 
+  @Override
+  public Optional<IStatEstimator> getStatEstimator() {
+    return Optional.of(new CoalesceStatEstimator());
+  }
+
+  static class CoalesceStatEstimator implements IStatEstimator {
+
+    @Override
+    public Optional<ColStatistics> estimate(List<ColStatistics> argStats) {
+      WorstStatCombiner combiner = new StatEstimators.WorstStatCombiner();
+      for (ColStatistics stat : argStats) {
+        combiner.add(stat);
+      }
+      return combiner.getResult();
+    }
+  }
 }
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java
index 23708dc..aa5faab 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java
@@ -18,6 +18,9 @@
 
 package org.apache.hadoop.hive.ql.udf.generic;
 
+import java.util.List;
+import java.util.Optional;
+
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
@@ -25,6 +28,11 @@
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressionsSupportDecimal64;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimator;
+import org.apache.hadoop.hive.ql.stats.estimator.IStatEstimatorProvider;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimators;
+import org.apache.hadoop.hive.ql.stats.estimator.StatEstimators.WorstStatCombiner;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
@@ -115,7 +123,7 @@
     IfExprTimestampScalarColumn.class, IfExprTimestampScalarScalar.class,
 })
 @VectorizedExpressionsSupportDecimal64()
-public class GenericUDFIf extends GenericUDF {
+public class GenericUDFIf extends GenericUDF implements IStatEstimatorProvider {
   private transient ObjectInspector[] argumentOIs;
   private transient GenericUDFUtils.ReturnObjectInspectorResolver returnOIResolver;
 
@@ -171,4 +179,22 @@
     assert (children.length == 3);
     return getStandardDisplayString("if", children);
   }
+
+  @Override
+  public Optional<IStatEstimator> getStatEstimator() {
+    return Optional.of(new IfStatEstimator());
+  }
+
+  static class IfStatEstimator implements IStatEstimator {
+
+    @Override
+    public Optional<ColStatistics> estimate(List<ColStatistics> argStats) {
+      WorstStatCombiner combiner = new StatEstimators.WorstStatCombiner();
+      // argStats.get(0) is the boolean condition; only the two value
+      // branches contribute to the output statistics
+      combiner.add(argStats.get(1));
+      combiner.add(argStats.get(2));
+      return combiner.getResult();
+    }
+  }
 }
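Before the test changes, it may help to restate the substring estimate arithmetic with the numbers from the new w1.q test below. For substr(a, 1, 4) over a column of 36-character strings, the constant arguments carry the ranges [1, 1] and [4, 4] (via getLongConstValue above), so the estimator shifts the average length by the minimal start offset and caps it by the maximal length. A sketch with a hypothetical helper:

class SubstrAvgLenDemo {
  // Mirrors SubStrStatEstimator for substr(a, 1, 4) with avgColLen = 36:
  static double estimateAvgColLen(double avgColLen, long startMin, long lenMax) {
    double estimate = avgColLen - startMin; // 36 - 1 = 35
    if (estimate <= 0) {
      estimate = avgColLen;                 // keep the original average on underflow
    }
    return Math.min(estimate, lenMax);      // capped at 4
  }
}

With avgColLen dropping from 36 to 4 over 3 rows, the Select Operator's data size in w1.q.out shrinks by 3 * 32 = 96 bytes, from 360 to 264.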
diff --git ql/src/test/queries/clientpositive/udf_coalesce.q ql/src/test/queries/clientpositive/udf_coalesce.q
index 7d87580..6c6594a 100644
--- ql/src/test/queries/clientpositive/udf_coalesce.q
+++ ql/src/test/queries/clientpositive/udf_coalesce.q
@@ -1,6 +1,7 @@
 --! qt:dataset:src_thrift
 --! qt:dataset:src
-set hive.fetch.task.conversion=more;
+set hive.cbo.enable=false;
+set hive.fetch.task.conversion=none;
 
 DESCRIBE FUNCTION coalesce;
 DESCRIBE FUNCTION EXTENDED coalesce;
@@ -47,6 +48,10 @@
 FROM src tablesample (1 rows);
 
 EXPLAIN
+SELECT COALESCE(key,'x') from src limit 1;
+
+
+EXPLAIN
 SELECT COALESCE(src_thrift.lint[1], 999),
        COALESCE(src_thrift.lintstring[0].mystring, '999'),
        COALESCE(src_thrift.mstringstring['key_2'], '999')
diff --git ql/src/test/queries/clientpositive/w1.q ql/src/test/queries/clientpositive/w1.q
new file mode 100644
index 0000000..4f259a7
--- /dev/null
+++ ql/src/test/queries/clientpositive/w1.q
@@ -0,0 +1,14 @@
+set hive.fetch.task.conversion=none;
+
+create table t (a string);
+
+insert into t values
+('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'),
+('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'),
+('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa');
+
+explain analyze
+select a from t;
+
+explain analyze
+select substr(a,1,4) from t;
diff --git ql/src/test/results/clientpositive/llap/udf_coalesce.q.out ql/src/test/results/clientpositive/llap/udf_coalesce.q.out
index f35ec7a..a934fdc 100644
--- ql/src/test/results/clientpositive/llap/udf_coalesce.q.out
+++ ql/src/test/results/clientpositive/llap/udf_coalesce.q.out
@@ -61,20 +61,39 @@
 POSTHOOK: Input: default@src
 #### A masked pattern was here ####
 STAGE DEPENDENCIES:
-  Stage-0 is a root stage
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
 
 STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Row Limit Per Split: 1
+                  Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    expressions: 1 (type: int), 1 (type: int), 2 (type: int), 1 (type: int), 3 (type: int), 4 (type: int), '1' (type: string), '1' (type: string), '2' (type: string), '1' (type: string), '3' (type: string), '4' (type: string), 1 (type: decimal(1,0)), 1 (type: decimal(1,0)), 2 (type: decimal(1,0)), 2 (type: decimal(1,0)), 2 (type: decimal(1,0)), null (type: int)
+                    outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17
+                    Statistics: Num rows: 500 Data size: 547004 Basic stats: COMPLETE Column stats: COMPLETE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 500 Data size: 547004 Basic stats: COMPLETE Column stats: COMPLETE
+                      table:
+                          input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: no inputs
+
   Stage: Stage-0
     Fetch Operator
       limit: -1
       Processor Tree:
-        TableScan
-          alias: src
-          Row Limit Per Split: 1
-          Select Operator
-            expressions: 1 (type: int), 1 (type: int), 2 (type: int), 1 (type: int), 3 (type: int), 4 (type: int), '1' (type: string), '1' (type: string), '2' (type: string), '1' (type: string), '3' (type: string), '4' (type: string), 1 (type: decimal(1,0)), 1 (type: decimal(1,0)), 2 (type: decimal(1,0)), 2 (type: decimal(1,0)), 2 (type: decimal(1,0)), null (type: int)
-            outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17
-            ListSink
+        ListSink
 
 PREHOOK: query: SELECT COALESCE(1),
        COALESCE(1, 2),
@@ -122,6 +141,53 @@
 #### A masked pattern was here ####
 1	1	2	1	3	4	1	1	2	1	3	4	1	1	2	2	2	NULL
 PREHOOK: query: EXPLAIN
+SELECT COALESCE(key,'x') from src limit 1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT COALESCE(key,'x') from src limit 1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: src
+                  Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    expressions: COALESCE(key,'x') (type: string)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 500 Data size: 43500 Basic stats: COMPLETE Column stats: COMPLETE
+                    Limit
+                      Number of rows: 1
+                      Statistics: Num rows: 1 Data size: 87 Basic stats: COMPLETE Column stats: COMPLETE
+                      File Output Operator
+                        compressed: false
+                        Statistics: Num rows: 1 Data size: 87 Basic stats: COMPLETE Column stats: COMPLETE
+                        table:
+                            input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                            output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                            serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: no inputs
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: EXPLAIN
 SELECT COALESCE(src_thrift.lint[1], 999),
        COALESCE(src_thrift.lintstring[0].mystring, '999'),
        COALESCE(src_thrift.mstringstring['key_2'], '999')
@@ -138,19 +204,38 @@
 POSTHOOK: Input: default@src_thrift
 #### A masked pattern was here ####
 STAGE DEPENDENCIES:
-  Stage-0 is a root stage
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
 
 STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: src_thrift
+                  Statistics: Num rows: 11 Data size: 39600 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: COALESCE(lint[1],999) (type: int), COALESCE(lintstring[0].mystring,'999') (type: string), COALESCE(mstringstring['key_2'],'999') (type: string)
+                    outputColumnNames: _col0, _col1, _col2
+                    Statistics: Num rows: 11 Data size: 39600 Basic stats: COMPLETE Column stats: NONE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 11 Data size: 39600 Basic stats: COMPLETE Column stats: NONE
+                      table:
+                          input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: no inputs
+
   Stage: Stage-0
     Fetch Operator
       limit: -1
      Processor Tree:
-        TableScan
-          alias: src_thrift
-          Select Operator
-            expressions: COALESCE(lint[1],999) (type: int), COALESCE(lintstring[0].mystring,'999') (type: string), COALESCE(mstringstring['key_2'],'999') (type: string)
-            outputColumnNames: _col0, _col1, _col2
-            ListSink
+        ListSink
 
 PREHOOK: query: SELECT COALESCE(src_thrift.lint[1], 999),
        COALESCE(src_thrift.lintstring[0].mystring, '999'),
diff --git ql/src/test/results/clientpositive/llap/w1.q.out ql/src/test/results/clientpositive/llap/w1.q.out
new file mode 100644
index 0000000..711d42f
--- /dev/null
+++ ql/src/test/results/clientpositive/llap/w1.q.out
@@ -0,0 +1,127 @@
+PREHOOK: query: create table t (a string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t
+POSTHOOK: query: create table t (a string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t
+PREHOOK: query: insert into t values
+('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'),
+('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'),
+('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t
+POSTHOOK: query: insert into t values
+('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'),
+('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'),
+('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t
+POSTHOOK: Lineage: t.a SCRIPT []
+PREHOOK: query: select a from t
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: select a from t
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+PREHOOK: query: explain analyze
+select a from t
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: explain analyze
+select a from t
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: t
+                  Statistics: Num rows: 3/3 Data size: 360 Basic stats: COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    expressions: a (type: string)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 3/3 Data size: 360 Basic stats: COMPLETE Column stats: COMPLETE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 3/3 Data size: 360 Basic stats: COMPLETE Column stats: COMPLETE
+                      table:
+                          input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: no inputs
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select substr(a,1,4) from t
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: select substr(a,1,4) from t
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+PREHOOK: query: explain analyze
+select substr(a,1,4) from t
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: explain analyze
+select substr(a,1,4) from t
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: t
+                  Statistics: Num rows: 3/3 Data size: 360 Basic stats: COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    expressions: substr(a, 1, 4) (type: string)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 3/3 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 3/3 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE
+                      table:
+                          input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            Execution mode: vectorized, llap
+            LLAP IO: no inputs
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+