diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
index 6b28be5..33bbdff 100644
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
@@ -608,7 +608,8 @@ public final class FunctionRegistry {
    * @return The UDAF evaluator
    */
   public static GenericUDAFEvaluator getGenericUDAFEvaluator(String name,
-      List<TypeInfo> argumentTypeInfos) throws SemanticException {
+      List<TypeInfo> argumentTypeInfos, boolean isDistinct,
+      boolean isAllColumns) throws SemanticException {
     GenericUDAFResolver udaf = getGenericUDAFResolver(name);
     if (udaf == null) {
       return null;
@@ -618,7 +619,7 @@ public final class FunctionRegistry {
     for (int i = 0; i < parameters.length; i++) {
       parameters[i] = argumentTypeInfos.get(i);
     }
-    return udaf.getEvaluator(parameters);
+    return udaf.getEvaluator(parameters, isDistinct, isAllColumns);
   }
 
   /**
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g
index 956801a..afb0432 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g
@@ -27,6 +27,7 @@ TOK_ALLCOLREF;
 TOK_TABLE_OR_COL;
 TOK_FUNCTION;
 TOK_FUNCTIONDI;
+TOK_FUNCTIONSTAR;
 TOK_WHERE;
 TOK_OP_EQ;
 TOK_OP_NE;
@@ -178,15 +179,15 @@ catch (RecognitionException e) {
 
 // starting rule
 statement
-  : explainStatement EOF
-  | execStatement EOF
-  ;
+  : explainStatement EOF
+  | execStatement EOF
+  ;
 
 explainStatement
 @init { msgs.push("explain statement"); }
 @after { msgs.pop(); }
-  : KW_EXPLAIN (isExtended=KW_EXTENDED)? execStatement -> ^(TOK_EXPLAIN execStatement $isExtended?)
-  ;
+  : KW_EXPLAIN (isExtended=KW_EXTENDED)? execStatement -> ^(TOK_EXPLAIN execStatement $isExtended?)
+  ;
 
 execStatement
 @init { msgs.push("statement"); }
@@ -375,16 +376,16 @@ alterStatementSuffixSerdeProperties
 alterStatementSuffixFileFormat
 @init {msgs.push("alter fileformat statement"); }
 @after {msgs.pop(); }
-  :name=Identifier KW_SET KW_FILEFORMAT fileFormat
-  -> ^(TOK_ALTERTABLE_FILEFORMAT $name fileFormat)
-  ;
+  :name=Identifier KW_SET KW_FILEFORMAT fileFormat
+  -> ^(TOK_ALTERTABLE_FILEFORMAT $name fileFormat)
+  ;
 
 alterStatementSuffixClusterbySortby
 @init {msgs.push("alter cluster by sort by statement");}
 @after{msgs.pop();}
-  :name=Identifier tableBuckets
-  ->^(TOK_ALTERTABLE_CLUSTER_SORT $name tableBuckets)
-  ;
+  :name=Identifier tableBuckets
+  ->^(TOK_ALTERTABLE_CLUSTER_SORT $name tableBuckets)
+  ;
 
 fileFormat
 @init { msgs.push("file format specification"); }
@@ -859,7 +860,7 @@ selectClause
 @init { msgs.push("select clause"); }
 @after { msgs.pop(); }
     :
-    KW_SELECT hintClause? (((KW_ALL | dist=KW_DISTINCT)? selectList)
+    KW_SELECT hintClause? (((KW_ALL | dist=KW_DISTINCT)? selectList)
                           | (transform=KW_TRANSFORM selectTrfmClause))
     -> {$transform == null && $dist == null}? ^(TOK_SELECT hintClause? selectList)
     -> {$transform == null && $dist != null}? ^(TOK_SELECTDI hintClause? selectList)
@@ -874,7 +875,7 @@ selectList
     :
     selectItem ( COMMA selectItem )* -> selectItem+
     ;
- 
+
 selectTrfmClause
 @init { msgs.push("transform clause"); }
 @after { msgs.pop(); }
@@ -1043,9 +1044,9 @@ joinToken
 lateralView
 @init {msgs.push("lateral view"); }
 @after {msgs.pop(); }
-  :
-  KW_LATERAL KW_VIEW function tableAlias KW_AS Identifier (COMMA Identifier)* -> ^(TOK_LATERAL_VIEW ^(TOK_SELECT ^(TOK_SELEXPR function Identifier+ tableAlias)))
-  ;
+  :
+  KW_LATERAL KW_VIEW function tableAlias KW_AS Identifier (COMMA Identifier)* -> ^(TOK_LATERAL_VIEW ^(TOK_SELECT ^(TOK_SELEXPR function Identifier+ tableAlias)))
+  ;
 
 tableAlias
 @init {msgs.push("table alias"); }
@@ -1162,10 +1163,13 @@ function
     :
     functionName
     LPAREN
-      (dist=KW_DISTINCT)?
-      (expression (COMMA expression)*)?
-    RPAREN -> {$dist == null}? ^(TOK_FUNCTION functionName (expression+)?)
-                    -> ^(TOK_FUNCTIONDI functionName (expression+)?)
+      (
+        (star=STAR)
+        | (dist=KW_DISTINCT)? (expression (COMMA expression)*)?
+      )
+    RPAREN -> {$star != null}? ^(TOK_FUNCTIONSTAR functionName)
+           -> {$dist == null}? ^(TOK_FUNCTION functionName (expression+)?)
+                            -> ^(TOK_FUNCTIONDI functionName (expression+)?)
     ;
 
 functionName
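With this grammar change, count(*) parses to a TOK_FUNCTIONSTAR node carrying only the function name, count(DISTINCT a, b) keeps producing TOK_FUNCTIONDI, and a plain call produces TOK_FUNCTION. The semantic analyzer changes below derive the two new resolver hints from the token type alone; a condensed illustrative fragment of that mapping (not itself part of the patch):

// Illustrative fragment: recovering the resolver hints from the function
// token emitted by the grammar above. HiveParser token constants are
// generated by ANTLR from Hive.g; "value" stands for any function ASTNode.
int tokenType = value.getType();
boolean isDistinct = tokenType == HiveParser.TOK_FUNCTIONDI;     // count(DISTINCT ...)
boolean isAllColumns = tokenType == HiveParser.TOK_FUNCTIONSTAR; // count(*)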
diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index 29d93c1..cee352a 100644
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -313,8 +313,10 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
    */
   private void doPhase1GetAllAggregations(ASTNode expressionTree,
       HashMap<String, ASTNode> aggregations) {
-    if (expressionTree.getToken().getType() == HiveParser.TOK_FUNCTION
-        || expressionTree.getToken().getType() == HiveParser.TOK_FUNCTIONDI) {
+    int exprTokenType = expressionTree.getToken().getType();
+    if (exprTokenType == HiveParser.TOK_FUNCTION
+        || exprTokenType == HiveParser.TOK_FUNCTIONDI
+        || exprTokenType == HiveParser.TOK_FUNCTIONSTAR) {
       assert (expressionTree.getChildCount() != 0);
       if (expressionTree.getChild(0).getType() == HiveParser.Identifier) {
         String functionName = unescapeIdentifier(expressionTree.getChild(0)
@@ -1709,7 +1711,9 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
     ASTNode udtfExpr = (ASTNode) selExprList.getChild(posn).getChild(0);
 
     GenericUDTF genericUDTF = null;
-    if (udtfExpr.getType() == HiveParser.TOK_FUNCTION) {
+    int udtfExprType = udtfExpr.getType();
+    if (udtfExprType == HiveParser.TOK_FUNCTION
+        || udtfExprType == HiveParser.TOK_FUNCTIONSTAR) {
       String funcName = TypeCheckProcFactory.DefaultExprProcessor
           .getFunctionText(udtfExpr, true);
       FunctionInfo fi = FunctionRegistry.getFunctionInfo(funcName);
@@ -1918,11 +1922,12 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
    * for each GroupBy aggregation.
    */
   static GenericUDAFEvaluator getGenericUDAFEvaluator(String aggName,
-      ArrayList<ExprNodeDesc> aggParameters, ASTNode aggTree)
+      ArrayList<ExprNodeDesc> aggParameters, ASTNode aggTree,
+      boolean isDistinct, boolean isAllColumns)
       throws SemanticException {
     ArrayList<TypeInfo> originalParameterTypeInfos = getTypeInfo(aggParameters);
     GenericUDAFEvaluator result = FunctionRegistry.getGenericUDAFEvaluator(
-        aggName, originalParameterTypeInfos);
+        aggName, originalParameterTypeInfos, isDistinct, isAllColumns);
     if (null == result) {
       String reason = "Looking for UDAF Evaluator\"" + aggName
           + "\" with parameters " + originalParameterTypeInfos;
@@ -2066,9 +2071,10 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
       }
 
       boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI;
+      boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR;
       Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
       GenericUDAFEvaluator genericUDAFEvaluator = getGenericUDAFEvaluator(
-          aggName, aggParameters, value);
+          aggName, aggParameters, value, isDistinct, isAllColumns);
       assert (genericUDAFEvaluator != null);
       GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode,
           aggParameters);
@@ -2188,12 +2194,13 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
             .getIsPartitionCol()));
       }
       boolean isDistinct = (value.getType() == HiveParser.TOK_FUNCTIONDI);
+      boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR;
       Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
       GenericUDAFEvaluator genericUDAFEvaluator = null;
       // For distincts, partial aggregations have not been done
       if (distPartAgg) {
         genericUDAFEvaluator = getGenericUDAFEvaluator(aggName, aggParameters,
-            value);
+            value, isDistinct, isAllColumns);
         assert (genericUDAFEvaluator != null);
         genericUDAFEvaluators.put(entry.getKey(), genericUDAFEvaluator);
       } else {
@@ -2305,10 +2312,11 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
       }
 
       boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI;
+      boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR;
       Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
 
       GenericUDAFEvaluator genericUDAFEvaluator = getGenericUDAFEvaluator(
-          aggName, aggParameters, value);
+          aggName, aggParameters, value, isDistinct, isAllColumns);
       assert (genericUDAFEvaluator != null);
       GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode,
           aggParameters);
@@ -2591,6 +2599,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
       String aggName = value.getChild(0).getText();
 
       boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI;
+      boolean isStar = value.getType() == HiveParser.TOK_FUNCTIONSTAR;
       Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
       GenericUDAFEvaluator genericUDAFEvaluator = genericUDAFEvaluators
           .get(entry.getKey());
@@ -3183,8 +3192,8 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
         dpCtx = new DynamicPartitionCtx(dest_tab, partSpec,
             conf.getVar(HiveConf.ConfVars.DEFAULTPARTITIONNAME),
             conf.getIntVar(HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTSPERNODE));
-        qbm.setDPCtx(dest, dpCtx);
-      }
+        qbm.setDPCtx(dest, dpCtx);
+      }
 
       if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.DYNAMICPARTITIONING)) { // allow DP
         // TODO: we should support merge files for dynamically generated partitions later
@@ -3202,7 +3211,7 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
       }
       if (dpCtx.getSPPath() != null) {
         dest_path = new Path(dest_tab.getPath(), dpCtx.getSPPath());
-      }
+      }
       if ((dest_tab.getNumBuckets() > 0) &&
           (conf.getBoolVar(HiveConf.ConfVars.HIVEENFORCEBUCKETING))) {
         dpCtx.setNumBuckets(dest_tab.getNumBuckets());
@@ -3414,10 +3423,10 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
             queryTmpdir,
             table_desc,
             conf.getBoolVar(HiveConf.ConfVars.COMPRESSRESULT),
-            currentTableId,
-            rsCtx.isMultiFileSpray(),
-            rsCtx.getNumFiles(),
-            rsCtx.getTotalFiles(),
+            currentTableId,
+            rsCtx.isMultiFileSpray(),
+            rsCtx.getNumFiles(),
+            rsCtx.getTotalFiles(),
             rsCtx.getPartnCols(),
             dpCtx),
         fsRS, input), inputRR);
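Taken together, the SemanticAnalyzer changes compute isDistinct and isAllColumns at every aggregation site and thread them through the widened FunctionRegistry entry point shown earlier. A minimal sketch of the resulting compile-time lookup for count(*), using the new four-argument signature (the wrapper method is hypothetical; imports elided):

// Hypothetical wrapper showing the new lookup path for count(*): the
// parameter list is empty and the isAllColumns hint stands in for "*".
static GenericUDAFEvaluator resolveCountStar() throws SemanticException {
  return FunctionRegistry.getGenericUDAFEvaluator("count",
      new ArrayList<TypeInfo>(), // count(*) supplies no explicit arguments
      false,                     // isDistinct
      true);                     // isAllColumns
}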
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/AbstractGenericUDAFResolver.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/AbstractGenericUDAFResolver.java
new file mode 100644
index 0000000..fd73d84
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/AbstractGenericUDAFResolver.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * An abstract class to help facilitate existing implementations of
+ * GenericUDAFResolver to migrate towards the newly introduced API
+ * {@link GenericUDAFResolver#getEvaluator(TypeInfo[], boolean, boolean)}. This
+ * class provides a default implementation of this new API and in turn calls
+ * the existing API {@link GenericUDAFResolver#getEvaluator(TypeInfo[])} by
+ * ignoring the extra parameters isDistinct and isAllColumns.
+ *
+ */
+public abstract class AbstractGenericUDAFResolver implements GenericUDAFResolver
+{
+
+  @SuppressWarnings("deprecation")
+  @Override
+  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters,
+      boolean isDistinct, boolean isAllColumns) throws SemanticException {
+    return getEvaluator(parameters);
+  }
+}
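For existing resolvers the migration is mechanical: switch "implements GenericUDAFResolver" to "extends AbstractGenericUDAFResolver" and keep the old single-argument getEvaluator, which the inherited default forwards to. A sketch with invented names (MyUDAFResolver and MyUDAFEvaluator are not part of the patch):

// Hypothetical migration example. Only the class header changes; the
// inherited three-argument getEvaluator() drops both hints and delegates
// to this legacy overload, preserving pre-patch behavior.
public class MyUDAFResolver extends AbstractGenericUDAFResolver {
  @Override
  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
      throws SemanticException {
    if (parameters.length != 1) {
      throw new UDFArgumentTypeException(parameters.length - 1,
          "Exactly one argument is expected.");
    }
    return new MyUDAFEvaluator();
  }
}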
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java
index 0ef4734..71f11a1 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java
@@ -45,7 +45,7 @@ import org.apache.hadoop.util.StringUtils;
  *
  */
 @Description(name = "avg", value = "_FUNC_(x) - Returns the mean of a set of numbers")
-public class GenericUDAFAverage implements GenericUDAFResolver {
+public class GenericUDAFAverage extends AbstractGenericUDAFResolver {
 
   static final Log LOG = LogFactory.getLog(GenericUDAFAverage.class.getName());
 
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBridge.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBridge.java
index 0ff41f7..e2628e6 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBridge.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBridge.java
@@ -38,7 +38,7 @@ import org.apache.hadoop.util.ReflectionUtils;
 * This class is a bridge between GenericUDAF and UDAF. Old UDAF can be used
 * with the GenericUDAF infrastructure through this bridge.
 */
-public class GenericUDAFBridge implements GenericUDAFResolver {
+public class GenericUDAFBridge extends AbstractGenericUDAFResolver {
 
   UDAF udaf;
 
@@ -51,7 +51,8 @@ public class GenericUDAFBridge implements GenericUDAFResolver {
   }
 
   @Override
-  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
+      throws SemanticException {
 
     Class<? extends UDAFEvaluator> udafEvaluatorClass = udaf.getResolver()
         .getEvaluatorClass(Arrays.asList(parameters));
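GenericUDAFBridge gets the same default behavior, so old-style UDAFs wrapped by the bridge silently ignore the new hints. A sketch of resolving a bridged legacy UDAF under the new API (MyLegacyUDAF is a hypothetical stand-in for any old UDAF subclass; TypeInfoFactory comes from org.apache.hadoop.hive.serde2.typeinfo):

// Hypothetical usage: the bridge inherits AbstractGenericUDAFResolver's
// default three-argument getEvaluator(), which discards both flags and
// calls the single-argument overload implemented above.
static GenericUDAFEvaluator resolveBridged() throws SemanticException {
  GenericUDAFBridge bridge = new GenericUDAFBridge(new MyLegacyUDAF());
  return bridge.getEvaluator(
      new TypeInfo[] { TypeInfoFactory.intTypeInfo },
      false,  // isDistinct: ignored by the default implementation
      false); // isAllColumns: ignored by the default implementation
}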
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCount.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCount.java
index 0054664..156f0f3 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCount.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCount.java
@@ -18,7 +18,7 @@
 package org.apache.hadoop.hive.ql.udf.generic;
 
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
@@ -30,16 +30,41 @@ import org.apache.hadoop.io.LongWritable;
 /**
  * This class implements the COUNT aggregation function as in SQL.
  */
-@Description(name = "count", value = "_FUNC_(x) - Returns the count")
-public class GenericUDAFCount implements GenericUDAFResolver {
+@Description(name = "count",
+    value = "_FUNC_(*) - Returns the total number of retrieved rows, including "
+        + "rows containing NULL values.\n"
+
+        + "_FUNC_(expr) - Returns the number of rows for which the supplied "
+        + "expression is non-NULL.\n"
+
+        + "_FUNC_(DISTINCT expr[, expr...]) - Returns the number of rows for "
+        + "which the supplied expression(s) are unique and non-NULL.")
+public class GenericUDAFCount extends AbstractGenericUDAFResolver {
 
   @Override
-  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
-    if (parameters.length != 1) {
-      throw new UDFArgumentTypeException(parameters.length - 1,
-          "Exactly one argument is expected.");
+  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
+      throws SemanticException {
+    // This method implementation is preserved for backward compatibility.
+    return getEvaluator(parameters, false, false);
+  }
+
+  @Override
+  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters,
+      boolean isDistinct, boolean isAllColumns) throws SemanticException {
+
+    if (parameters.length == 0) {
+      if (!isAllColumns) {
+        throw new UDFArgumentException("Argument expected");
+      }
+      assert !isDistinct : "DISTINCT not supported with *";
+    } else {
+      if (parameters.length > 1 && !isDistinct) {
+        throw new UDFArgumentException("DISTINCT keyword must be specified");
+      }
+      assert !isAllColumns : "* not supported in expression list";
     }
-    return new GenericUDAFCountEvaluator();
+
+    return new GenericUDAFCountEvaluator().setCountAllColumns(isAllColumns);
   }
 
   /**
@@ -47,18 +72,25 @@
    *
    */
   public static class GenericUDAFCountEvaluator extends GenericUDAFEvaluator {
-    private ObjectInspector inputOI;
+    private boolean countAllColumns;
+    private LongObjectInspector partialCountAggOI;
     private LongWritable result;
 
     @Override
-    public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
+    public ObjectInspector init(Mode m, ObjectInspector[] parameters)
+        throws HiveException {
       super.init(m, parameters);
-      assert (parameters.length == 1);
-      inputOI = parameters[0];
+      partialCountAggOI =
+        PrimitiveObjectInspectorFactory.writableLongObjectInspector;
       result = new LongWritable(0);
       return PrimitiveObjectInspectorFactory.writableLongObjectInspector;
     }
 
+    private GenericUDAFCountEvaluator setCountAllColumns(boolean countAllCols) {
+      countAllColumns = countAllCols;
+      return this;
+    }
+
     /** class for storing count value. */
     static class CountAgg implements AggregationBuffer {
       long value;
@@ -66,9 +98,9 @@
 
     @Override
     public AggregationBuffer getNewAggregationBuffer() throws HiveException {
-      CountAgg result = new CountAgg();
-      reset(result);
-      return result;
+      CountAgg buffer = new CountAgg();
+      reset(buffer);
+      return buffer;
     }
 
     @Override
@@ -77,17 +109,31 @@
     }
 
     @Override
-    public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
-      assert (parameters.length == 1);
-      if (parameters[0] != null) {
+    public void iterate(AggregationBuffer agg, Object[] parameters)
+        throws HiveException {
+      if (countAllColumns) {
+        assert parameters.length == 0;
         ((CountAgg) agg).value++;
+      } else {
+        assert parameters.length > 0;
+        boolean countThisRow = true;
+        for (Object nextParam : parameters) {
+          if (nextParam == null) {
+            countThisRow = false;
+            break;
+          }
+        }
+        if (countThisRow) {
+          ((CountAgg) agg).value++;
+        }
       }
     }
 
     @Override
-    public void merge(AggregationBuffer agg, Object partial) throws HiveException {
+    public void merge(AggregationBuffer agg, Object partial)
+        throws HiveException {
       if (partial != null) {
-        long p = ((LongObjectInspector) inputOI).get(partial);
+        long p = partialCountAggOI.get(partial);
         ((CountAgg) agg).value += p;
       }
     }
@@ -102,7 +148,5 @@
     public Object terminatePartial(AggregationBuffer agg) throws HiveException {
       return terminate(agg);
     }
-
   }
-
 }
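The rewritten iterate() gives count three behaviors: count(*) counts every row, count(expr) skips rows where the argument is NULL, and count(DISTINCT e1, e2) counts a row only when every expression is non-NULL (uniqueness itself is enforced by the DISTINCT plan, not by the evaluator). A standalone illustration of just that counting rule, separate from the patch code:

// Standalone sketch of the row-counting rule in
// GenericUDAFCountEvaluator.iterate(): with countAllColumns (count(*))
// every row counts; otherwise a row counts only if no argument is NULL.
static long countRows(Object[][] rows, boolean countAllColumns) {
  long count = 0;
  for (Object[] args : rows) {
    if (countAllColumns) {
      count++;                    // count(*): NULLs are irrelevant
      continue;
    }
    boolean countThisRow = true;
    for (Object arg : args) {
      if (arg == null) {          // any NULL argument excludes the row
        countThisRow = false;
        break;
      }
    }
    if (countThisRow) {
      count++;
    }
  }
  return count;
}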
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMax.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMax.java
index 6785687..b9a295c 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMax.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMax.java
@@ -19,25 +19,24 @@
 package org.apache.hadoop.hive.ql.udf.generic;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
-import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 
 @Description(name = "max", value = "_FUNC_(expr) - Returns the maximum value of expr")
-public class GenericUDAFMax implements GenericUDAFResolver {
+public class GenericUDAFMax extends AbstractGenericUDAFResolver {
 
   static final Log LOG = LogFactory.getLog(GenericUDAFMax.class.getName());
 
   @Override
   public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
-    throws SemanticException {
+      throws SemanticException {
     if (parameters.length != 1) {
       throw new UDFArgumentTypeException(parameters.length - 1,
           "Exactly one argument is expected.");
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMin.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMin.java
index 051f3a1..107d663 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMin.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMin.java
@@ -19,25 +19,24 @@
 package org.apache.hadoop.hive.ql.udf.generic;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
-import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
 
 @Description(name = "min", value = "_FUNC_(expr) - Returns the minimum value of expr")
-public class GenericUDAFMin implements GenericUDAFResolver {
+public class GenericUDAFMin extends AbstractGenericUDAFResolver {
 
   static final Log LOG = LogFactory.getLog(GenericUDAFMin.class.getName());
 
   @Override
   public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
-    throws SemanticException {
+      throws SemanticException {
     if (parameters.length != 1) {
       throw new UDFArgumentTypeException(parameters.length - 1,
           "Exactly one argument is expected.");
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java
index 9888b52..c01d802 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java
@@ -24,28 +24,67 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 /**
  * A Generic User-defined aggregation function (GenericUDAF) for the use with
  * Hive.
- * 
+ *
  * GenericUDAFResolver is used at compile time. We use GenericUDAFResolver to
  * find out the GenericUDAFEvaluator for the parameter types.
- * 
+ *
  */
 public interface GenericUDAFResolver {
 
   /**
    * Get the evaluator for the parameter types.
-   * 
+   *
+   * The reason that this function returns an object instead of a class is
+   * because it is possible that the object needs some configuration (that can
+   * be serialized). In that case the class of the object has to implement the
+   * Serializable interface. At execution time, we will deserialize the object
+   * from the plan and use it to evaluate the aggregations.
+   *
+   * If the class of the object does not implement Serializable, then we will
+   * create a new instance of the class at execution time.
+   *
+   * @param parameters
+   *          The types of the parameters. We need the type information to know
+   *          which evaluator class to use.
+   * @throws SemanticException
+   * @deprecated Use {@link #getEvaluator(TypeInfo[], boolean, boolean)}
+   *             instead.
+   */
+  @Deprecated
+  GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
+      throws SemanticException;
+
+  /**
+   * Get the evaluator for the parameter types.
+   *
    * The reason that this function returns an object instead of a class is
-   * because it's possible that the object needs some configuration (that can be
-   * serialized). In that case the class of the object has to implement the
+   * because it is possible that the object needs some configuration (that can
+   * be serialized). In that case the class of the object has to implement the
    * Serializable interface. At execution time, we will deserialize the object
    * from the plan and use it to evaluate the aggregations.
-   * 
+   *
    * If the class of the object does not implement Serializable, then we will
   * create a new instance of the class at execution time.
-   * 
+   *
+   * This method differs from {@link #getEvaluator(TypeInfo[])} in that it
+   * allows the resolver to determine the appropriate evaluator match based
+   * on extra information regarding the specification of the DISTINCT
+   * qualifier for the parameter list, or if the call was invoked using the
+   * func(*) signature instead of an explicit parameter list. These
+   * hints are provided via the boolean flags isDistinct and
+   * isAllColumns.
+   *
    * @param parameters
+   *          The types of the parameters. We need the type information to know
+   *          which evaluator class to use.
+   * @param isDistinct
+   *          A boolean flag indicating if the parameters were qualified by
+   *          the DISTINCT keyword.
+   * @param isAllColumns
+   *          A boolean flag indicating that the special argument * was
+   *          supplied instead of a regular parameter list.
+   * @throws SemanticException
    */
-  GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException;
+  GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters,
+      boolean isDistinct, boolean isAllColumns) throws SemanticException;
 }
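A resolver that actually consults the new hints follows the pattern GenericUDAFCount establishes earlier in this patch: validate the flag/parameter combination, then pick or configure an evaluator. A condensed sketch with hypothetical names:

// Hypothetical flag-aware resolver. MyDistinctEvaluator/MyPlainEvaluator
// are invented; the validation mirrors GenericUDAFCount.getEvaluator().
public class MyFlagAwareResolver extends AbstractGenericUDAFResolver {
  @Override
  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters,
      boolean isDistinct, boolean isAllColumns) throws SemanticException {
    if (isAllColumns && parameters.length > 0) {
      throw new UDFArgumentException("* cannot be combined with arguments");
    }
    return isDistinct ? new MyDistinctEvaluator() : new MyPlainEvaluator();
  }
}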
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java
index ce97afd..5c4b3bf 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java
@@ -38,12 +38,13 @@ import org.apache.hadoop.util.StringUtils;
  *
  */
 @Description(name = "sum", value = "_FUNC_(x) - Returns the sum of a set of numbers")
-public class GenericUDAFSum implements GenericUDAFResolver {
+public class GenericUDAFSum extends AbstractGenericUDAFResolver {
 
   static final Log LOG = LogFactory.getLog(GenericUDAFSum.class.getName());
 
   @Override
-  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
+      throws SemanticException {
     if (parameters.length != 1) {
       throw new UDFArgumentTypeException(parameters.length - 1,
           "Exactly one argument is expected.");
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java
index c19857a..8c70fb7 100644
--- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java
@@ -43,16 +43,17 @@ import org.apache.hadoop.util.StringUtils;
 /**
  * Compute the variance. This class is extended by: GenericUDAFVarianceSample
  * GenericUDAFStd GenericUDAFStdSample
- * 
+ *
  */
 @Description(name = "variance,var_pop",
     value = "_FUNC_(x) - Returns the variance of a set of numbers")
-public class GenericUDAFVariance implements GenericUDAFResolver {
+public class GenericUDAFVariance extends AbstractGenericUDAFResolver {
 
   static final Log LOG = LogFactory.getLog(GenericUDAFVariance.class.getName());
 
   @Override
-  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
+  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
+      throws SemanticException {
     if (parameters.length != 1) {
       throw new UDFArgumentTypeException(parameters.length - 1,
           "Exactly one argument is expected.");
@@ -83,15 +84,15 @@
 
   /**
    * Evaluate the variance using the following modification of the formula from
    * The Art of Computer Programming, vol. 2, p. 232:
-   * 
+   *
    * variance = variance1 + variance2 + n*alpha^2 + m*betha^2
-   * 
+   *
    * where: - variance is sum[x-avg^2] (this is actually n times the variance)
    * and is updated at every step. - n is the count of elements in chunk1 - m is
    * the count of elements in chunk2 - alpha = avg-a - betha = avg-b - avg is
   * the the average of all elements from both chunks - a is the average of
    * elements in chunk1 - b is the average of elements in chunk2
-   * 
+   *
    */
   public static class GenericUDAFVarianceEvaluator extends GenericUDAFEvaluator {
diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/package-info.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/package-info.java
new file mode 100644
index 0000000..b0e2e82
--- /dev/null
+++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/package-info.java
@@ -0,0 +1,23 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This package contains the generic UDAF framework along with implementation
+ * of the core UDAFs.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
diff --git ql/src/test/queries/clientpositive/udf_count.q ql/src/test/queries/clientpositive/udf_count.q
index 2d1510f..625140c 100644
--- ql/src/test/queries/clientpositive/udf_count.q
+++ ql/src/test/queries/clientpositive/udf_count.q
@@ -1,2 +1,9 @@
 DESCRIBE FUNCTION count;
 DESCRIBE FUNCTION EXTENDED count;
+
+SELECT count(key) FROM src;
+SELECT count(DISTINCT key) FROM src;
+SELECT count(DISTINCT key, value) FROM src;
+SELECT count(*) FROM src;
+SELECT count(1) FROM src;
+
diff --git ql/src/test/results/clientpositive/udf_count.q.out ql/src/test/results/clientpositive/udf_count.q.out
index 2adffd8..172d51a 100644
--- ql/src/test/results/clientpositive/udf_count.q.out
+++ ql/src/test/results/clientpositive/udf_count.q.out
@@ -2,9 +2,58 @@ PREHOOK: query: DESCRIBE FUNCTION count
 PREHOOK: type: DESCFUNCTION
 POSTHOOK: query: DESCRIBE FUNCTION count
 POSTHOOK: type: DESCFUNCTION
-count(x) - Returns the count
+count(*) - Returns the total number of retrieved rows, including rows containing NULL values.
+count(expr) - Returns the number of rows for which the supplied expression is non-NULL.
+count(DISTINCT expr[, expr...]) - Returns the number of rows for which the supplied expression(s) are unique and non-NULL.
 PREHOOK: query: DESCRIBE FUNCTION EXTENDED count
 PREHOOK: type: DESCFUNCTION
 POSTHOOK: query: DESCRIBE FUNCTION EXTENDED count
 POSTHOOK: type: DESCFUNCTION
-count(x) - Returns the count
+count(*) - Returns the total number of retrieved rows, including rows containing NULL values.
+count(expr) - Returns the number of rows for which the supplied expression is non-NULL.
+count(DISTINCT expr[, expr...]) - Returns the number of rows for which the supplied expression(s) are unique and non-NULL.
+PREHOOK: query: SELECT count(key) FROM src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-25_15-03-57_444_8268094848386405053/10000
+POSTHOOK: query: SELECT count(key) FROM src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-25_15-03-57_444_8268094848386405053/10000
+500
+PREHOOK: query: SELECT count(DISTINCT key) FROM src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-25_15-04-03_027_2280178607671255545/10000
+POSTHOOK: query: SELECT count(DISTINCT key) FROM src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-25_15-04-03_027_2280178607671255545/10000
+309
+PREHOOK: query: SELECT count(DISTINCT key, value) FROM src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-25_15-04-07_470_7969389937698436561/10000
+POSTHOOK: query: SELECT count(DISTINCT key, value) FROM src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-25_15-04-07_470_7969389937698436561/10000
+309
+PREHOOK: query: SELECT count(*) FROM src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-25_15-04-11_941_5092558765566594685/10000
+POSTHOOK: query: SELECT count(*) FROM src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-25_15-04-11_941_5092558765566594685/10000
+500
+PREHOOK: query: SELECT count(1) FROM src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-25_15-04-16_880_218108047003625408/10000
+POSTHOOK: query: SELECT count(1) FROM src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-25_15-04-16_880_218108047003625408/10000
+500