diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index 6b28be5..33bbdff 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -608,7 +608,8 @@ public final class FunctionRegistry { * @return The UDAF evaluator */ public static GenericUDAFEvaluator getGenericUDAFEvaluator(String name, - List<TypeInfo> argumentTypeInfos) throws SemanticException { + List<TypeInfo> argumentTypeInfos, boolean isDistinct, + boolean isAllColumns) throws SemanticException { GenericUDAFResolver udaf = getGenericUDAFResolver(name); if (udaf == null) { return null; @@ -618,7 +619,7 @@ public final class FunctionRegistry { for (int i = 0; i < parameters.length; i++) { parameters[i] = argumentTypeInfos.get(i); } - return udaf.getEvaluator(parameters); + return udaf.getEvaluator(parameters, isDistinct, isAllColumns); } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g index 956801a..afb0432 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g +++ ql/src/java/org/apache/hadoop/hive/ql/parse/Hive.g @@ -27,6 +27,7 @@ TOK_ALLCOLREF; TOK_TABLE_OR_COL; TOK_FUNCTION; TOK_FUNCTIONDI; +TOK_FUNCTIONSTAR; TOK_WHERE; TOK_OP_EQ; TOK_OP_NE; @@ -178,15 +179,15 @@ catch (RecognitionException e) { // starting rule statement - : explainStatement EOF - | execStatement EOF - ; + : explainStatement EOF + | execStatement EOF + ; explainStatement @init { msgs.push("explain statement"); } @after { msgs.pop(); } - : KW_EXPLAIN (isExtended=KW_EXTENDED)? execStatement -> ^(TOK_EXPLAIN execStatement $isExtended?) - ; + : KW_EXPLAIN (isExtended=KW_EXTENDED)? execStatement -> ^(TOK_EXPLAIN execStatement $isExtended?) + ; execStatement @init { msgs.push("statement"); } @@ -375,16 +376,16 @@ alterStatementSuffixSerdeProperties alterStatementSuffixFileFormat @init {msgs.push("alter fileformat statement"); } @after {msgs.pop(); } - :name=Identifier KW_SET KW_FILEFORMAT fileFormat - -> ^(TOK_ALTERTABLE_FILEFORMAT $name fileFormat) - ; + :name=Identifier KW_SET KW_FILEFORMAT fileFormat + -> ^(TOK_ALTERTABLE_FILEFORMAT $name fileFormat) + ; alterStatementSuffixClusterbySortby @init {msgs.push("alter cluster by sort by statement");} @after{msgs.pop();} - :name=Identifier tableBuckets - ->^(TOK_ALTERTABLE_CLUSTER_SORT $name tableBuckets) - ; + :name=Identifier tableBuckets + ->^(TOK_ALTERTABLE_CLUSTER_SORT $name tableBuckets) + ; fileFormat @init { msgs.push("file format specification"); } @@ -859,7 +860,7 @@ selectClause @init { msgs.push("select clause"); } @after { msgs.pop(); } : - KW_SELECT hintClause? (((KW_ALL | dist=KW_DISTINCT)? selectList) + KW_SELECT hintClause? (((KW_ALL | dist=KW_DISTINCT)? selectList) | (transform=KW_TRANSFORM selectTrfmClause)) -> {$transform == null && $dist == null}? ^(TOK_SELECT hintClause? selectList) -> {$transform == null && $dist != null}? ^(TOK_SELECTDI hintClause?
selectList) @@ -874,7 +875,7 @@ selectList : selectItem ( COMMA selectItem )* -> selectItem+ ; - + selectTrfmClause @init { msgs.push("transform clause"); } @after { msgs.pop(); } @@ -1043,9 +1044,9 @@ joinToken lateralView @init {msgs.push("lateral view"); } @after {msgs.pop(); } - : - KW_LATERAL KW_VIEW function tableAlias KW_AS Identifier (COMMA Identifier)* -> ^(TOK_LATERAL_VIEW ^(TOK_SELECT ^(TOK_SELEXPR function Identifier+ tableAlias))) - ; + : + KW_LATERAL KW_VIEW function tableAlias KW_AS Identifier (COMMA Identifier)* -> ^(TOK_LATERAL_VIEW ^(TOK_SELECT ^(TOK_SELEXPR function Identifier+ tableAlias))) + ; tableAlias @init {msgs.push("table alias"); } @@ -1162,10 +1163,13 @@ function : functionName LPAREN - (dist=KW_DISTINCT)? - (expression (COMMA expression)*)? - RPAREN -> {$dist == null}? ^(TOK_FUNCTION functionName (expression+)?) - -> ^(TOK_FUNCTIONDI functionName (expression+)?) + ( + (star=STAR) + | (dist=KW_DISTINCT)? (expression (COMMA expression)*)? + ) + RPAREN -> {$star != null}? ^(TOK_FUNCTIONSTAR functionName) + -> {$dist == null}? ^(TOK_FUNCTION functionName (expression+)?) + -> ^(TOK_FUNCTIONDI functionName (expression+)?) ; functionName diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 29d93c1..cee352a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -313,8 +313,10 @@ */ private void doPhase1GetAllAggregations(ASTNode expressionTree, HashMap<String, ASTNode> aggregations) { - if (expressionTree.getToken().getType() == HiveParser.TOK_FUNCTION - || expressionTree.getToken().getType() == HiveParser.TOK_FUNCTIONDI) { + int exprTokenType = expressionTree.getToken().getType(); + if (exprTokenType == HiveParser.TOK_FUNCTION + || exprTokenType == HiveParser.TOK_FUNCTIONDI + || exprTokenType == HiveParser.TOK_FUNCTIONSTAR) { assert (expressionTree.getChildCount() != 0); if (expressionTree.getChild(0).getType() == HiveParser.Identifier) { String functionName = unescapeIdentifier(expressionTree.getChild(0) @@ -1709,7 +1711,9 @@ ASTNode udtfExpr = (ASTNode) selExprList.getChild(posn).getChild(0); GenericUDTF genericUDTF = null; - if (udtfExpr.getType() == HiveParser.TOK_FUNCTION) { + int udtfExprType = udtfExpr.getType(); + if (udtfExprType == HiveParser.TOK_FUNCTION + || udtfExprType == HiveParser.TOK_FUNCTIONSTAR) { String funcName = TypeCheckProcFactory.DefaultExprProcessor .getFunctionText(udtfExpr, true); FunctionInfo fi = FunctionRegistry.getFunctionInfo(funcName); @@ -1918,11 +1922,12 @@ * for each GroupBy aggregation.
*/ static GenericUDAFEvaluator getGenericUDAFEvaluator(String aggName, - ArrayList<ExprNodeDesc> aggParameters, ASTNode aggTree) + ArrayList<ExprNodeDesc> aggParameters, ASTNode aggTree, + boolean isDistinct, boolean isAllColumns) throws SemanticException { ArrayList<TypeInfo> originalParameterTypeInfos = getTypeInfo(aggParameters); GenericUDAFEvaluator result = FunctionRegistry.getGenericUDAFEvaluator( - aggName, originalParameterTypeInfos); + aggName, originalParameterTypeInfos, isDistinct, isAllColumns); if (null == result) { String reason = "Looking for UDAF Evaluator\"" + aggName + "\" with parameters " + originalParameterTypeInfos; @@ -2066,9 +2071,10 @@ } boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI; + boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR; Mode amode = groupByDescModeToUDAFMode(mode, isDistinct); GenericUDAFEvaluator genericUDAFEvaluator = getGenericUDAFEvaluator( - aggName, aggParameters, value); + aggName, aggParameters, value, isDistinct, isAllColumns); assert (genericUDAFEvaluator != null); GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters); @@ -2188,12 +2194,13 @@ .getIsPartitionCol())); } boolean isDistinct = (value.getType() == HiveParser.TOK_FUNCTIONDI); + boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR; Mode amode = groupByDescModeToUDAFMode(mode, isDistinct); GenericUDAFEvaluator genericUDAFEvaluator = null; // For distincts, partial aggregations have not been done if (distPartAgg) { genericUDAFEvaluator = getGenericUDAFEvaluator(aggName, aggParameters, - value); + value, isDistinct, isAllColumns); assert (genericUDAFEvaluator != null); genericUDAFEvaluators.put(entry.getKey(), genericUDAFEvaluator); } else { @@ -2305,10 +2312,11 @@ } boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI; + boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR; Mode amode = groupByDescModeToUDAFMode(mode, isDistinct); GenericUDAFEvaluator genericUDAFEvaluator = getGenericUDAFEvaluator( - aggName, aggParameters, value); + aggName, aggParameters, value, isDistinct, isAllColumns); assert (genericUDAFEvaluator != null); GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters); @@ -2591,6 +2599,7 @@ String aggName = value.getChild(0).getText(); boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI; + boolean isStar = value.getType() == HiveParser.TOK_FUNCTIONSTAR; Mode amode = groupByDescModeToUDAFMode(mode, isDistinct); GenericUDAFEvaluator genericUDAFEvaluator = genericUDAFEvaluators .get(entry.getKey()); @@ -3183,8 +3192,8 @@ dpCtx = new DynamicPartitionCtx(dest_tab, partSpec, conf.getVar(HiveConf.ConfVars.DEFAULTPARTITIONNAME), conf.getIntVar(HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTSPERNODE)); - qbm.setDPCtx(dest, dpCtx); - } + qbm.setDPCtx(dest, dpCtx); + } if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.DYNAMICPARTITIONING)) { // allow DP // TODO: we should support merge files for dynamically generated partitions later @@ -3202,7 +3211,7 @@ } if (dpCtx.getSPPath() != null) { dest_path = new Path(dest_tab.getPath(), dpCtx.getSPPath()); - } + } if ((dest_tab.getNumBuckets() > 0) &&
(conf.getBoolVar(HiveConf.ConfVars.HIVEENFORCEBUCKETING))) { dpCtx.setNumBuckets(dest_tab.getNumBuckets()); @@ -3414,10 +3423,10 @@ queryTmpdir, table_desc, conf.getBoolVar(HiveConf.ConfVars.COMPRESSRESULT), - currentTableId, - rsCtx.isMultiFileSpray(), - rsCtx.getNumFiles(), - rsCtx.getTotalFiles(), + currentTableId, + rsCtx.isMultiFileSpray(), + rsCtx.getNumFiles(), + rsCtx.getTotalFiles(), rsCtx.getPartnCols(), dpCtx), fsRS, input), inputRR); diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java index 0ef4734..dfa98d2 100644 --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFAverage.java @@ -50,7 +50,8 @@ public class GenericUDAFAverage implements GenericUDAFResolver { static final Log LOG = LogFactory.getLog(GenericUDAFAverage.class.getName()); @Override - public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters, + boolean isDistinct, boolean isAllColumns) throws SemanticException { if (parameters.length != 1) { throw new UDFArgumentTypeException(parameters.length - 1, diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBridge.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBridge.java index 0ff41f7..4fcf0dd 100644 --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBridge.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBridge.java @@ -51,7 +51,8 @@ public class GenericUDAFBridge implements GenericUDAFResolver { } @Override - public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException { + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters, + boolean isDistinct, boolean isAllColumns) throws SemanticException { Class<? extends UDAFEvaluator> udafEvaluatorClass = udaf.getResolver() .getEvaluatorClass(Arrays.asList(parameters)); diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCount.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCount.java index 0054664..3619384 100644 --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCount.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFCount.java @@ -18,7 +18,7 @@ package org.apache.hadoop.hive.ql.udf.generic; import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -30,16 +30,33 @@ import org.apache.hadoop.io.LongWritable; /** * This class implements the COUNT aggregation function as in SQL. */ -@Description(name = "count", value = "_FUNC_(x) - Returns the count") +@Description(name = "count", + value = "_FUNC_(*) - Returns the total number of rows from the table;\n" + + "_FUNC_([DISTINCT] expr1[, expr2...]) - " + + "Returns count of rows where given expression(s) have non-null" + + " value. The DISTINCT keyword must be specified if more than one" + + " expression is given as argument to _FUNC_.
Specifying DISTINCT" + + " keyword returns the number of rows where all evaluated" + + " expressions have different non-null values.") public class GenericUDAFCount implements GenericUDAFResolver { @Override - public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException { - if (parameters.length != 1) { - throw new UDFArgumentTypeException(parameters.length - 1, - "Exactly one argument is expected."); + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters, + boolean isDistinct, boolean isAllColumns) throws SemanticException { + + if (parameters.length == 0) { + if (!isAllColumns) { + throw new UDFArgumentException("Argument expected"); + } + assert !isDistinct : "DISTINCT not supported with *"; + } else { + if (parameters.length > 1 && !isDistinct) { + throw new UDFArgumentException("DISTINCT keyword must be specified"); + } + assert !isAllColumns : "* not supported in expression list"; } - return new GenericUDAFCountEvaluator(); + + return new GenericUDAFCountEvaluator().setCountAllColumns(isAllColumns); } /** @@ -47,18 +64,25 @@ * */ public static class GenericUDAFCountEvaluator extends GenericUDAFEvaluator { - private ObjectInspector inputOI; + private boolean countAllColumns; + private LongObjectInspector partialCountAggOI; private LongWritable result; @Override - public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException { + public ObjectInspector init(Mode m, ObjectInspector[] parameters) + throws HiveException { super.init(m, parameters); - assert (parameters.length == 1); - inputOI = parameters[0]; + partialCountAggOI = + PrimitiveObjectInspectorFactory.writableLongObjectInspector; result = new LongWritable(0); return PrimitiveObjectInspectorFactory.writableLongObjectInspector; } + private GenericUDAFCountEvaluator setCountAllColumns(boolean countAllCols) { + countAllColumns = countAllCols; + return this; + } + /** class for storing count value.
*/ static class CountAgg implements AggregationBuffer { long value; @@ -66,9 +90,9 @@ public class GenericUDAFCount implements GenericUDAFResolver { @Override public AggregationBuffer getNewAggregationBuffer() throws HiveException { - CountAgg result = new CountAgg(); - reset(result); - return result; + CountAgg buffer = new CountAgg(); + reset(buffer); + return buffer; } @Override @@ -77,17 +101,31 @@ public class GenericUDAFCount implements GenericUDAFResolver { } @Override - public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException { - assert (parameters.length == 1); - if (parameters[0] != null) { + public void iterate(AggregationBuffer agg, Object[] parameters) + throws HiveException { + if (countAllColumns) { + assert parameters.length == 0; ((CountAgg) agg).value++; + } else { + assert parameters.length > 0; + boolean countThisRow = true; + for (Object nextParam : parameters) { + if (nextParam == null) { + countThisRow = false; + break; + } + } + if (countThisRow) { + ((CountAgg) agg).value++; + } } } @Override - public void merge(AggregationBuffer agg, Object partial) throws HiveException { + public void merge(AggregationBuffer agg, Object partial) + throws HiveException { if (partial != null) { - long p = ((LongObjectInspector) inputOI).get(partial); + long p = partialCountAggOI.get(partial); ((CountAgg) agg).value += p; } } @@ -102,7 +140,5 @@ public class GenericUDAFCount implements GenericUDAFResolver { public Object terminatePartial(AggregationBuffer agg) throws HiveException { return terminate(agg); } - } - } diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMax.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMax.java index 6785687..c0c87d6 100644 --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMax.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMax.java @@ -19,13 +19,12 @@ package org.apache.hadoop.hive.ql.udf.generic; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; @@ -36,8 +35,8 @@ public class GenericUDAFMax implements GenericUDAFResolver { static final Log LOG = LogFactory.getLog(GenericUDAFMax.class.getName()); @Override - public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) - throws SemanticException { + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters, + boolean isDistinct, boolean isAllColumns) throws SemanticException { if (parameters.length != 1) { throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected."); diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMin.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMin.java index 051f3a1..12187f2 100644 --- 
ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMin.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFMin.java @@ -19,13 +19,12 @@ package org.apache.hadoop.hive.ql.udf.generic; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; @@ -36,8 +35,8 @@ public class GenericUDAFMin implements GenericUDAFResolver { static final Log LOG = LogFactory.getLog(GenericUDAFMin.class.getName()); @Override - public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) - throws SemanticException { + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters, + boolean isDistinct, boolean isAllColumns) throws SemanticException { if (parameters.length != 1) { throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected."); diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java index 9888b52..bd860cb 100644 --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFResolver.java @@ -24,28 +24,35 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * A Generic User-defined aggregation function (GenericUDAF) for use with * Hive. - * + * * GenericUDAFResolver is used at compile time. We use GenericUDAFResolver to * find out the GenericUDAFEvaluator for the parameter types. - * + * */ public interface GenericUDAFResolver { /** * Get the evaluator for the parameter types. - * + * * The reason that this function returns an object instead of a class is * because it's possible that the object needs some configuration (that can be * serialized). In that case the class of the object has to implement the * Serializable interface. At execution time, we will deserialize the object * from the plan and use it to evaluate the aggregations. - * + * * If the class of the object does not implement Serializable, then we will * create a new instance of the class at execution time. - * + * * @param parameters * The types of the parameters. We need the type information to know * which evaluator class to use. + * @param isDistinct + * A boolean flag indicating if the parameters were qualified by + * the DISTINCT keyword. + * @param isAllColumns + * A boolean flag indicating that the special argument * + * was supplied instead of a regular parameter list.
*/ - GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException; + GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters, + boolean isDistinct, boolean isAllColumns) throws SemanticException; } diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStd.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStd.java index 8c59b8c..da9c235 100644 --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStd.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStd.java @@ -28,15 +28,15 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * Compute the standard deviation by extending GenericUDAFVariance and * overriding the terminate() method of the evaluator. - * + * */ @Description(name = "std,stddev,stddev_pop", value = "_FUNC_(x) - Returns the standard deviation of a set of numbers") public class GenericUDAFStd extends GenericUDAFVariance { @Override - public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) - throws SemanticException { + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters, + boolean isDistinct, boolean isAllColumns) throws SemanticException { if (parameters.length != 1) { throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected."); @@ -67,7 +67,7 @@ public class GenericUDAFStd extends GenericUDAFVariance { /** * Compute the standard deviation by extending GenericUDAFVarianceEvaluator * and overriding the terminate() method of the evaluator. - * + * */ public static class GenericUDAFStdEvaluator extends GenericUDAFVarianceEvaluator { diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStdSample.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStdSample.java index 631915c..c3c307c 100644 --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStdSample.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFStdSample.java @@ -28,14 +28,15 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * Compute the sample standard deviation by extending GenericUDAFVariance and * overriding the terminate() method of the evaluator. 
- * + * */ @Description(name = "stddev_samp", value = "_FUNC_(x) - Returns the sample standard deviation of a set of numbers") public class GenericUDAFStdSample extends GenericUDAFVariance { @Override - public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException { + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters, + boolean isDistinct, boolean isAllColumns) throws SemanticException { if (parameters.length != 1) { throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected."); diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java index ce97afd..30b87f7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java @@ -43,7 +43,8 @@ public class GenericUDAFSum implements GenericUDAFResolver { static final Log LOG = LogFactory.getLog(GenericUDAFSum.class.getName()); @Override - public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException { + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters, + boolean isDistinct, boolean isAllColumns) throws SemanticException { if (parameters.length != 1) { throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected."); diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java index c19857a..5ca3ee4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVariance.java @@ -43,7 +43,7 @@ import org.apache.hadoop.util.StringUtils; /** * Compute the variance. This class is extended by: GenericUDAFVarianceSample * GenericUDAFStd GenericUDAFStdSample - * + * */ @Description(name = "variance,var_pop", value = "_FUNC_(x) - Returns the variance of a set of numbers") @@ -52,7 +52,8 @@ public class GenericUDAFVariance implements GenericUDAFResolver { static final Log LOG = LogFactory.getLog(GenericUDAFVariance.class.getName()); @Override - public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException { + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters, + boolean isDistinct, boolean isAllColumns) throws SemanticException { if (parameters.length != 1) { throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected."); @@ -83,15 +84,15 @@ public class GenericUDAFVariance { /** * Evaluate the variance using the following modification of the formula from * The Art of Computer Programming, vol. 2, p. 232: - * + * * variance = variance1 + variance2 + n*alpha^2 + m*beta^2 - * + * * where: - variance is sum[(x-avg)^2] (this is actually n times the variance) * and is updated at every step.
- n is the count of elements in chunk1 - m is * the count of elements in chunk2 - alpha = avg-a - beta = avg-b - avg is * the average of all elements from both chunks - a is the average of * elements in chunk1 - b is the average of elements in chunk2 - * + * */ public static class GenericUDAFVarianceEvaluator extends GenericUDAFEvaluator { diff --git ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVarianceSample.java ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVarianceSample.java index 240c819..84d15e3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVarianceSample.java +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFVarianceSample.java @@ -28,15 +28,15 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; /** * Compute the sample variance by extending GenericUDAFVariance and overriding * the terminate() method of the evaluator. - * + * */ @Description(name = "var_samp", value = "_FUNC_(x) - Returns the sample variance of a set of numbers") public class GenericUDAFVarianceSample extends GenericUDAFVariance { @Override - public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) - throws SemanticException { + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters, + boolean isDistinct, boolean isAllColumns) throws SemanticException { if (parameters.length != 1) { throw new UDFArgumentTypeException(parameters.length - 1, "Exactly one argument is expected."); diff --git ql/src/test/queries/clientpositive/udf_count.q ql/src/test/queries/clientpositive/udf_count.q index 2d1510f..625140c 100644 --- ql/src/test/queries/clientpositive/udf_count.q +++ ql/src/test/queries/clientpositive/udf_count.q @@ -1,2 +1,9 @@ DESCRIBE FUNCTION count; DESCRIBE FUNCTION EXTENDED count; + +SELECT count(key) FROM src; +SELECT count(DISTINCT key) FROM src; +SELECT count(DISTINCT key, value) FROM src; +SELECT count(*) FROM src; +SELECT count(1) FROM src; + diff --git ql/src/test/results/clientpositive/udf_count.q.out ql/src/test/results/clientpositive/udf_count.q.out index 2adffd8..b8159c2 100644 --- ql/src/test/results/clientpositive/udf_count.q.out +++ ql/src/test/results/clientpositive/udf_count.q.out @@ -2,9 +2,56 @@ PREHOOK: query: DESCRIBE FUNCTION count PREHOOK: type: DESCFUNCTION POSTHOOK: query: DESCRIBE FUNCTION count POSTHOOK: type: DESCFUNCTION -count(x) - Returns the count +count(*) - Returns the total number of rows from the table; +count([DISTINCT] expr1[, expr2...]) - Returns count of rows where given expression(s) have non-null value. The DISTINCT keyword must be specified if more than one expression is given as argument to count. Specifying DISTINCT keyword returns the number of rows where all evaluated expressions have different non-null values. PREHOOK: query: DESCRIBE FUNCTION EXTENDED count PREHOOK: type: DESCFUNCTION POSTHOOK: query: DESCRIBE FUNCTION EXTENDED count POSTHOOK: type: DESCFUNCTION -count(x) - Returns the count +count(*) - Returns the total number of rows from the table; +count([DISTINCT] expr1[, expr2...]) - Returns count of rows where given expression(s) have non-null value. The DISTINCT keyword must be specified if more than one expression is given as argument to count. Specifying DISTINCT keyword returns the number of rows where all evaluated expressions have different non-null values.
+PREHOOK: query: SELECT count(key) FROM src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-19_18-39-22_221_5412641915651872488/10000 +POSTHOOK: query: SELECT count(key) FROM src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-19_18-39-22_221_5412641915651872488/10000 +500 +PREHOOK: query: SELECT count(DISTINCT key) FROM src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-19_18-39-26_889_7115178695761089481/10000 +POSTHOOK: query: SELECT count(DISTINCT key) FROM src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-19_18-39-26_889_7115178695761089481/10000 +309 +PREHOOK: query: SELECT count(DISTINCT key, value) FROM src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-19_18-39-30_909_8202475220890511289/10000 +POSTHOOK: query: SELECT count(DISTINCT key, value) FROM src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-19_18-39-30_909_8202475220890511289/10000 +309 +PREHOOK: query: SELECT count(*) FROM src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-19_18-39-35_097_1344005184348938120/10000 +POSTHOOK: query: SELECT count(*) FROM src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-19_18-39-35_097_1344005184348938120/10000 +500 +PREHOOK: query: SELECT count(1) FROM src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-19_18-39-39_041_2325300703291340194/10000 +POSTHOOK: query: SELECT count(1) FROM src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: file:/Users/arvind/work/src/hive/trunk/build/ql/scratchdir/hive_2010-05-19_18-39-39_041_2325300703291340194/10000 +500
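
Note for UDAF implementors: this patch widens GenericUDAFResolver.getEvaluator() from getEvaluator(TypeInfo[]) to getEvaluator(TypeInfo[], boolean, boolean), so any out-of-tree resolver must be updated before it will compile against the patched tree. Below is a minimal sketch of an adapted resolver; the class CustomCountResolver and its package are hypothetical (not part of this patch), and it simply reuses GenericUDAFCount's evaluator, which, with countAllColumns left at its default of false, counts rows whose arguments are all non-null.

package com.example.hive; // hypothetical package, for illustration only

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

public class CustomCountResolver implements GenericUDAFResolver {

  @Override
  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters,
      boolean isDistinct, boolean isAllColumns) throws SemanticException {
    // Before this patch the method took only TypeInfo[]; a resolver must now
    // accept the two flags and decide explicitly whether DISTINCT and the
    // special * argument are meaningful for the function.
    if (isAllColumns) {
      // When * is supplied, the parameter list is empty (see the
      // TOK_FUNCTIONSTAR handling above); this hypothetical function
      // rejects it at compile time.
      throw new UDFArgumentException("custom_count(*) is not supported");
    }
    if (parameters.length != 1) {
      throw new UDFArgumentException("Exactly one argument is expected.");
    }
    // DISTINCT needs no special handling here: duplicate elimination for a
    // single-argument aggregate happens during query planning, so the flag
    // is accepted and ignored, just as GenericUDAFMax/Min do above.
    return new GenericUDAFCount.GenericUDAFCountEvaluator();
  }
}

Registered under a name such as custom_count (for example via FunctionRegistry.registerGenericUDAF), this would behave like count(expr) while rejecting custom_count(*) during semantic analysis.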