From d6145524dba72d8d5028edf74d83e8208bcb6411 Mon Sep 17 00:00:00 2001 From: Ashutosh Chauhan Date: Tue, 1 Dec 2015 14:39:13 -0800 Subject: [PATCH] HIVE-12491 : Column Statistics: 3 attribute join on a 2-source table is off --- .../stats/annotation/StatsRulesProcFactory.java | 346 +++++++++------------ .../apache/hadoop/hive/ql/stats/StatsUtils.java | 52 +++- .../apache/hadoop/hive/ql/udf/UDFDayOfMonth.java | 2 + .../org/apache/hadoop/hive/ql/udf/UDFHour.java | 2 + .../org/apache/hadoop/hive/ql/udf/UDFMinute.java | 2 + .../org/apache/hadoop/hive/ql/udf/UDFMonth.java | 4 +- .../org/apache/hadoop/hive/ql/udf/UDFSecond.java | 2 + .../apache/hadoop/hive/ql/udf/UDFWeekOfYear.java | 2 + .../org/apache/hadoop/hive/ql/udf/UDFYear.java | 2 + .../hive/ql/udf/generic/GenericUDFAddMonths.java | 1 + .../ql/udf/generic/GenericUDFArrayContains.java | 1 + .../hive/ql/udf/generic/GenericUDFBetween.java | 1 + .../hive/ql/udf/generic/GenericUDFCurrentDate.java | 2 + .../ql/udf/generic/GenericUDFCurrentTimestamp.java | 1 + .../hive/ql/udf/generic/GenericUDFCurrentUser.java | 1 + .../hive/ql/udf/generic/GenericUDFOPAnd.java | 1 + .../hive/ql/udf/generic/GenericUDFOPEqual.java | 1 + .../hive/ql/udf/generic/GenericUDFOPEqualNS.java | 1 + .../generic/GenericUDFOPEqualOrGreaterThan.java | 1 + .../udf/generic/GenericUDFOPEqualOrLessThan.java | 1 + .../ql/udf/generic/GenericUDFOPGreaterThan.java | 1 + .../hive/ql/udf/generic/GenericUDFOPLessThan.java | 1 + .../hive/ql/udf/generic/GenericUDFOPNot.java | 1 + .../hive/ql/udf/generic/GenericUDFOPNotEqual.java | 3 +- .../hive/ql/udf/generic/GenericUDFOPNotNull.java | 1 + .../hive/ql/udf/generic/GenericUDFOPNull.java | 1 + .../hadoop/hive/ql/udf/generic/GenericUDFOPOr.java | 1 + .../org/apache/hadoop/hive/ql/udf/generic/NDV.java | 27 ++ .../hadoop/hive/ql/udf/generic/UDFCurrentDB.java | 1 + .../results/clientpositive/cbo_rp_auto_join0.q.out | 16 +- .../results/clientpositive/tez/explainuser_1.q.out | 16 +- 31 files changed, 281 insertions(+), 214 deletions(-) create mode 100644 ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NDV.java diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index a8ff158..c1e314f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -967,20 +967,6 @@ private boolean checkMapSideAggregation(GroupByOperator gop, // worst-case, hash aggregation disabled return false; } - - private long applyGBYRule(long numRows, long dvProd) { - long newNumRows = numRows; - - // to avoid divide by 2 to become 0 - if (numRows > 1) { - if (dvProd != 0) { - newNumRows = Math.min(numRows / 2, dvProd); - } else { - newNumRows = numRows / 2; - } - } - return newNumRows; - } } /** @@ -1032,170 +1018,156 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, int numAttr = 1; AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx; HiveConf conf = aspCtx.getConf(); - boolean allStatsAvail = true; boolean allSatisfyPreCondition = true; for (Operator op : parents) { if (op.getStatistics() == null) { - allStatsAvail = false; + return null; } } - if (allStatsAvail) { - - for (Operator op : parents) { - if (!satisfyPrecondition(op.getStatistics())) { - allSatisfyPreCondition = false; - } + for (Operator op : parents) { + if 
(!satisfyPrecondition(op.getStatistics())) { + allSatisfyPreCondition = false; + break; } + } - if (allSatisfyPreCondition) { - - // statistics object that is combination of statistics from all - // relations involved in JOIN - Statistics stats = new Statistics(); - List distinctVals = Lists.newArrayList(); - int numParent = parents.size(); - Map rowCountParents = Maps.newHashMap(); - Map joinStats = Maps.newHashMap(); - Map> joinKeys = Maps.newHashMap(); - List rowCounts = Lists.newArrayList(); - - // detect if there are multiple attributes in join key - ReduceSinkOperator rsOp = (ReduceSinkOperator) jop.getParentOperators().get(0); - List keyExprs = StatsUtils.getQualifedReducerKeyNames(rsOp.getConf() + if (allSatisfyPreCondition) { + + // statistics object that is combination of statistics from all + // relations involved in JOIN + Statistics stats = new Statistics(); + int numParent = parents.size(); + Map rowCountParents = Maps.newHashMap(); + Map joinStats = Maps.newHashMap(); + Map> joinKeys = Maps.newHashMap(); + List rowCounts = Lists.newArrayList(); + + // detect if there are multiple attributes in join key + ReduceSinkOperator rsOp = (ReduceSinkOperator) jop.getParentOperators().get(0); + List keyExprs = StatsUtils.getQualifedReducerKeyNames(rsOp.getConf() + .getOutputKeyColumnNames()); + numAttr = keyExprs.size(); + + // infer PK-FK relationship in single attribute join case + long inferredRowCount = inferPKFKRelationship(numAttr, parents, jop); + // get the join keys from parent ReduceSink operators + for (int pos = 0; pos < parents.size(); pos++) { + ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos); + Statistics parentStats = parent.getStatistics(); + keyExprs = StatsUtils.getQualifedReducerKeyNames(parent.getConf() .getOutputKeyColumnNames()); - numAttr = keyExprs.size(); - // infer PK-FK relationship in single attribute join case - long inferredRowCount = inferPKFKRelationship(numAttr, parents, jop); - // get the join keys from parent ReduceSink operators - for (int pos = 0; pos < parents.size(); pos++) { - ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos); - Statistics parentStats = parent.getStatistics(); - keyExprs = StatsUtils.getQualifedReducerKeyNames(parent.getConf() - .getOutputKeyColumnNames()); - - rowCountParents.put(pos, parentStats.getNumRows()); - rowCounts.add(parentStats.getNumRows()); + rowCountParents.put(pos, parentStats.getNumRows()); + rowCounts.add(parentStats.getNumRows()); - // internal name for expressions and estimate column statistics for expression. - joinKeys.put(pos, keyExprs); + // internal name for expressions and estimate column statistics for expression. 
+ joinKeys.put(pos, keyExprs); - // get column statistics for all output columns - joinStats.put(pos, parentStats); + // get column statistics for all output columns + joinStats.put(pos, parentStats); - // since new statistics is derived from all relations involved in - // JOIN, we need to update the state information accordingly - stats.updateColumnStatsState(parentStats.getColumnStatsState()); - } + // since new statistics is derived from all relations involved in + // JOIN, we need to update the state information accordingly + stats.updateColumnStatsState(parentStats.getColumnStatsState()); + } - // compute denominator i.e, max(V(R,Y), V(S,Y)) in case of single - // attribute join, else max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2)) + List distinctVals = Lists.newArrayList(); + long denom = 1; + if (inferredRowCount == -1) { + // failed to infer PK-FK relationship for row count estimation fall-back on default logic + // compute denominator max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2)) // in case of multi-attribute join - long denom = 1; - if (numAttr > 1) { - List perAttrDVs = Lists.newArrayList(); - for (int idx = 0; idx < numAttr; idx++) { - for (Integer i : joinKeys.keySet()) { - String col = joinKeys.get(i).get(idx); - ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col); - if (cs != null) { - perAttrDVs.add(cs.getCountDistint()); - } + List perAttrDVs = Lists.newArrayList(); + for (int idx = 0; idx < numAttr; idx++) { + for (Integer i : joinKeys.keySet()) { + String col = joinKeys.get(i).get(idx); + ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col); + if (cs != null) { + perAttrDVs.add(cs.getCountDistint()); } - distinctVals.add(getDenominator(perAttrDVs)); - perAttrDVs.clear(); } + distinctVals.add(getDenominator(perAttrDVs)); + perAttrDVs.clear(); + } - if (numAttr > numParent) { - // To avoid denominator getting larger and aggressively reducing - // number of rows, we will ease out denominator. - denom = getEasedOutDenominator(distinctVals); - } else { - for (Long l : distinctVals) { - denom = StatsUtils.safeMult(denom, l); - } - } + if (numAttr > numParent) { + // To avoid denominator getting larger and aggressively reducing + // number of rows, we will ease out denominator. 
+ denom = StatsUtils.addWithExpDecay(distinctVals); } else { - if (numAttr == 1) { - for (Integer i : joinKeys.keySet()) { - String col = joinKeys.get(i).get(0); - ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col); - if (cs != null) { - distinctVals.add(cs.getCountDistint()); - } - } + for (Long l : distinctVals) { + denom = StatsUtils.safeMult(denom, l); } - denom = getDenominator(distinctVals); } + } - // Update NDV of joined columns to be min(V(R,y), V(S,y)) - updateJoinColumnsNDV(joinKeys, joinStats, numAttr); - - // column statistics from different sources are put together and - // rename based on output schema of join operator - Map colExprMap = jop.getColumnExprMap(); - RowSchema rs = jop.getSchema(); - List outColStats = Lists.newArrayList(); - for (ColumnInfo ci : rs.getSignature()) { - String key = ci.getInternalName(); - ExprNodeDesc end = colExprMap.get(key); - if (end instanceof ExprNodeColumnDesc) { - String colName = ((ExprNodeColumnDesc) end).getColumn(); - int pos = jop.getConf().getReversedExprs().get(key); - ColStatistics cs = joinStats.get(pos).getColumnStatisticsFromColName(colName); - String outColName = key; - if (cs != null) { - cs.setColumnName(outColName); - } - outColStats.add(cs); + // Update NDV of joined columns to be min(V(R,y), V(S,y)) + updateJoinColumnsNDV(joinKeys, joinStats, numAttr); + + // column statistics from different sources are put together and + // rename based on output schema of join operator + Map colExprMap = jop.getColumnExprMap(); + RowSchema rs = jop.getSchema(); + List outColStats = Lists.newArrayList(); + for (ColumnInfo ci : rs.getSignature()) { + String key = ci.getInternalName(); + ExprNodeDesc end = colExprMap.get(key); + if (end instanceof ExprNodeColumnDesc) { + String colName = ((ExprNodeColumnDesc) end).getColumn(); + int pos = jop.getConf().getReversedExprs().get(key); + ColStatistics cs = joinStats.get(pos).getColumnStatisticsFromColName(colName); + String outColName = key; + if (cs != null) { + cs.setColumnName(outColName); } + outColStats.add(cs); } + } - // update join statistics - stats.setColumnStats(outColStats); - long newRowCount = inferredRowCount !=-1 ? inferredRowCount : computeNewRowCount(rowCounts, denom); - updateStatsForJoinType(stats, newRowCount, jop, rowCountParents); - jop.setStatistics(stats); + // update join statistics + stats.setColumnStats(outColStats); + long newRowCount = inferredRowCount !=-1 ? 
inferredRowCount : computeNewRowCount(rowCounts, denom); + updateStatsForJoinType(stats, newRowCount, jop, rowCountParents); + jop.setStatistics(stats); - if (isDebugEnabled) { - LOG.debug("[0] STATS-" + jop.toString() + ": " + stats.extendedToString()); - } - } else { + if (isDebugEnabled) { + LOG.debug("[0] STATS-" + jop.toString() + ": " + stats.extendedToString()); + } + } else { - // worst case when there are no column statistics - float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR); - int numParents = parents.size(); - List parentRows = Lists.newArrayList(); - List parentSizes = Lists.newArrayList(); - int maxRowIdx = 0; - long maxRowCount = 0; - int idx = 0; - - for (Operator op : parents) { - Statistics ps = op.getStatistics(); - long rowCount = ps.getNumRows(); - if (rowCount > maxRowCount) { - maxRowCount = rowCount; - maxRowIdx = idx; - } - parentRows.add(rowCount); - parentSizes.add(ps.getDataSize()); - idx++; + // worst case when there are no column statistics + float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR); + int numParents = parents.size(); + List parentRows = Lists.newArrayList(); + List parentSizes = Lists.newArrayList(); + int maxRowIdx = 0; + long maxRowCount = 0; + int idx = 0; + + for (Operator op : parents) { + Statistics ps = op.getStatistics(); + long rowCount = ps.getNumRows(); + if (rowCount > maxRowCount) { + maxRowCount = rowCount; + maxRowIdx = idx; } + parentRows.add(rowCount); + parentSizes.add(ps.getDataSize()); + idx++; + } - long maxDataSize = parentSizes.get(maxRowIdx); - newNumRows = StatsUtils.safeMult(StatsUtils.safeMult(maxRowCount, (numParents - 1)), joinFactor); - long newDataSize = StatsUtils.safeMult(StatsUtils.safeMult(maxDataSize, (numParents - 1)), joinFactor); - Statistics wcStats = new Statistics(); - wcStats.setNumRows(newNumRows); - wcStats.setDataSize(newDataSize); - jop.setStatistics(wcStats); + long maxDataSize = parentSizes.get(maxRowIdx); + newNumRows = StatsUtils.safeMult(StatsUtils.safeMult(maxRowCount, (numParents - 1)), joinFactor); + long newDataSize = StatsUtils.safeMult(StatsUtils.safeMult(maxDataSize, (numParents - 1)), joinFactor); + Statistics wcStats = new Statistics(); + wcStats.setNumRows(newNumRows); + wcStats.setDataSize(newDataSize); + jop.setStatistics(wcStats); - if (isDebugEnabled) { - LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString()); - } + if (isDebugEnabled) { + LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString()); } } return null; @@ -1204,44 +1176,46 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, private long inferPKFKRelationship(int numAttr, List> parents, CommonJoinOperator jop) { long newNumRows = -1; - if (numAttr == 1) { - // If numAttr is 1, this means we join on one single key column. - Map parentsWithPK = getPrimaryKeyCandidates(parents); - - // We only allow one single PK. - if (parentsWithPK.size() != 1) { - LOG.debug("STATS-" + jop.toString() + ": detects none/multiple PK parents."); - return newNumRows; - } - Integer pkPos = parentsWithPK.keySet().iterator().next(); - ColStatistics csPK = parentsWithPK.values().iterator().next(); + if (numAttr != 1) { + return newNumRows; + } - // infer foreign key candidates positions - Map csFKs = getForeignKeyCandidates(parents, csPK); + // If numAttr is 1, this means we join on one single key column. 
+ Map parentsWithPK = getPrimaryKeyCandidates(parents); - // we allow multiple foreign keys (snowflake schema) - // csfKs.size() + 1 == parents.size() means we have a single PK and all - // the rest ops are FKs. - if (csFKs.size() + 1 == parents.size()) { - newNumRows = getCardinality(parents, pkPos, csPK, csFKs, jop); + // We only allow one single PK. + if (parentsWithPK.size() != 1) { + LOG.debug("STATS-" + jop.toString() + ": detects none/multiple PK parents."); + return newNumRows; + } + Integer pkPos = parentsWithPK.keySet().iterator().next(); + ColStatistics csPK = parentsWithPK.values().iterator().next(); - // some debug information - if (isDebugEnabled) { - List parentIds = Lists.newArrayList(); + // infer foreign key candidates positions + Map csFKs = getForeignKeyCandidates(parents, csPK); - // print primary key containing parents - for (Integer i : parentsWithPK.keySet()) { - parentIds.add(parents.get(i).toString()); - } - LOG.debug("STATS-" + jop.toString() + ": PK parent id(s) - " + parentIds); - parentIds.clear(); + // we allow multiple foreign keys (snowflake schema) + // csfKs.size() + 1 == parents.size() means we have a single PK and all + // the rest ops are FKs. + if (csFKs.size() + 1 == parents.size()) { + newNumRows = getCardinality(parents, pkPos, csPK, csFKs, jop); - // print foreign key containing parents - for (Integer i : csFKs.keySet()) { - parentIds.add(parents.get(i).toString()); - } - LOG.debug("STATS-" + jop.toString() + ": FK parent id(s) - " + parentIds); + // some debug information + if (isDebugEnabled) { + List parentIds = Lists.newArrayList(); + + // print primary key containing parents + for (Integer i : parentsWithPK.keySet()) { + parentIds.add(parents.get(i).toString()); } + LOG.debug("STATS-" + jop.toString() + ": PK parent id(s) - " + parentIds); + parentIds.clear(); + + // print foreign key containing parents + for (Integer i : csFKs.keySet()) { + parentIds.add(parents.get(i).toString()); + } + LOG.debug("STATS-" + jop.toString() + ": FK parent id(s) - " + parentIds); } } return newNumRows; @@ -1425,20 +1399,6 @@ private float getSelectivityComplexTree(Operator op) { return result; } - private Long getEasedOutDenominator(List distinctVals) { - // Exponential back-off for NDVs. - // 1) Descending order sort of NDVs - // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4))) * .... 
- Collections.sort(distinctVals, Collections.reverseOrder()); - - long denom = distinctVals.get(0); - for (int i = 1; i < distinctVals.size(); i++) { - denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i))); - } - - return denom; - } - private void updateStatsForJoinType(Statistics stats, long newNumRows, CommonJoinOperator jop, Map rowCountParents) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 71ed31c..149cbc1 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -36,6 +36,7 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.Decimal; import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.exec.RowSchema; import org.apache.hadoop.hive.ql.exec.TableScanOperator; import org.apache.hadoop.hive.ql.exec.Utilities; @@ -43,6 +44,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.Partition; import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory; import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.apache.hadoop.hive.ql.plan.ColStatistics.Range; @@ -54,6 +56,9 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.Statistics.State; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; +import org.apache.hadoop.hive.ql.udf.generic.NDV; import org.apache.hadoop.hive.ql.util.JavaDataModel; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; @@ -85,11 +90,13 @@ import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; import org.apache.hadoop.io.BytesWritable; +import org.apache.hive.common.util.AnnotationUtils; import org.apache.tez.mapreduce.hadoop.MRJobConfig; import java.math.BigDecimal; import java.math.BigInteger; import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -1247,7 +1254,7 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis // null projection if (encd.getValue() == null) { colName = encd.getName(); - colType = "null"; + colType = serdeConstants.VOID_TYPE_NAME; numNulls = numRows; } else { colName = encd.getName(); @@ -1261,14 +1268,14 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end; colName = engfd.getName(); colType = engfd.getTypeString(); - countDistincts = numRows; + countDistincts = getNDVFor(engfd, numRows, parentStats); oi = engfd.getWritableObjectInspector(); } else if (end instanceof ExprNodeColumnListDesc) { // column list ExprNodeColumnListDesc encd = (ExprNodeColumnListDesc) end; colName = Joiner.on(",").join(encd.getCols()); - colType = "array"; + colType = serdeConstants.LIST_TYPE_NAME; countDistincts = numRows; oi = encd.getWritableObjectInspector(); } else if (end instanceof ExprNodeFieldDesc) { @@ -1305,6 +1312,45 @@ public 
static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis return colStats; } + + public static Long addWithExpDecay (List distinctVals) { + // Exponential back-off for NDVs. + // 1) Descending order sort of NDVs + // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4))) * .... + Collections.sort(distinctVals, Collections.reverseOrder()); + + long denom = distinctVals.get(0); + for (int i = 1; i < distinctVals.size(); i++) { + denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i))); + } + + return denom; + } + + private static long getNDVFor(ExprNodeGenericFuncDesc engfd, long numRows, Statistics parentStats) { + + GenericUDF udf = engfd.getGenericUDF(); + if (!FunctionRegistry.isDeterministic(udf)){ + return numRows; + } + List ndvs = Lists.newArrayList(); + Class udfClass = udf instanceof GenericUDFBridge ? ((GenericUDFBridge) udf).getUdfClass() : udf.getClass(); + NDV ndv = AnnotationUtils.getAnnotation(udfClass, NDV.class); + long udfNDV = Long.MAX_VALUE; + if (ndv != null) { + udfNDV = ndv.maxNdv(); + } else { + for (String col : engfd.getCols()) { + ColStatistics stats = parentStats.getColumnStatisticsFromColName(col); + if (stats != null) { + ndvs.add(stats.getCountDistint()); + } + } + } + long countDistincts = ndvs.isEmpty() ? numRows : addWithExpDecay(ndvs); + return Collections.min(Lists.newArrayList(countDistincts, udfNDV, numRows)); + } + /** * Get number of rows of a give table * @return number of rows diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFDayOfMonth.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFDayOfMonth.java index 21e6ff7..79825fc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFDayOfMonth.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFDayOfMonth.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDayOfMonthLong; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDayOfMonthString; +import org.apache.hadoop.hive.ql.udf.generic.NDV; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; @@ -48,6 +49,7 @@ + "Example:\n " + " > SELECT _FUNC_('2009-07-30') FROM src LIMIT 1;\n" + " 30") @VectorizedExpressions({VectorUDFDayOfMonthLong.class, VectorUDFDayOfMonthString.class}) +@NDV(maxNdv = 31) public class UDFDayOfMonth extends UDF { private final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); private final Calendar calendar = Calendar.getInstance(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFHour.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFHour.java index 835cecc..87e19ec 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFHour.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFHour.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFHourLong; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFHourString; +import org.apache.hadoop.hive.ql.udf.generic.NDV; import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; import org.apache.hadoop.io.IntWritable; @@ -48,6 +49,7 @@ + " 12\n" + " > SELECT _FUNC_('12:58:59') FROM src LIMIT 1;\n" + " 12") @VectorizedExpressions({VectorUDFHourLong.class, VectorUDFHourString.class}) 
+@NDV(maxNdv = 24) public class UDFHour extends UDF { private final SimpleDateFormat formatter1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); private final SimpleDateFormat formatter2 = new SimpleDateFormat("HH:mm:ss"); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMinute.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMinute.java index a9f5393..0f55266 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMinute.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMinute.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFMinuteLong; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFMinuteString; +import org.apache.hadoop.hive.ql.udf.generic.NDV; import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; import org.apache.hadoop.io.IntWritable; @@ -48,6 +49,7 @@ + " 58\n" + " > SELECT _FUNC_('12:58:59') FROM src LIMIT 1;\n" + " 58") @VectorizedExpressions({VectorUDFMinuteLong.class, VectorUDFMinuteString.class}) +@NDV(maxNdv = 60) public class UDFMinute extends UDF { private final SimpleDateFormat formatter1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); private final SimpleDateFormat formatter2 = new SimpleDateFormat("HH:mm:ss"); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java index 3365804..efe5ee2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFMonthLong; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFMonthString; +import org.apache.hadoop.hive.ql.udf.generic.NDV; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; @@ -48,11 +49,12 @@ + "Example:\n" + " > SELECT _FUNC_('2009-07-30') FROM src LIMIT 1;\n" + " 7") @VectorizedExpressions({VectorUDFMonthLong.class, VectorUDFMonthString.class}) +@NDV(maxNdv = 31) public class UDFMonth extends UDF { private final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); private final Calendar calendar = Calendar.getInstance(); - private IntWritable result = new IntWritable(); + private final IntWritable result = new IntWritable(); public UDFMonth() { } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSecond.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSecond.java index e7c3d67..b724970 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSecond.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSecond.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFSecondLong; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFSecondString; +import org.apache.hadoop.hive.ql.udf.generic.NDV; import org.apache.hadoop.hive.serde2.io.DoubleWritable; import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; @@ -51,6 +52,7 @@ + " 59\n" + " > SELECT _FUNC_('12:58:59') FROM src LIMIT 1;\n" + " 59") @VectorizedExpressions({VectorUDFSecondLong.class, VectorUDFSecondString.class}) 
+@NDV(maxNdv = 60) public class UDFSecond extends UDF { private final SimpleDateFormat formatter1 = new SimpleDateFormat( "yyyy-MM-dd HH:mm:ss"); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFWeekOfYear.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFWeekOfYear.java index f076d1d..42ee1bf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFWeekOfYear.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFWeekOfYear.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFWeekOfYearLong; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFWeekOfYearString; +import org.apache.hadoop.hive.ql.udf.generic.NDV; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; import org.apache.hadoop.io.IntWritable; @@ -45,6 +46,7 @@ + " 8\n" + " > SELECT _FUNC_('1980-12-31 12:59:59') FROM src LIMIT 1;\n" + " 1") @VectorizedExpressions({VectorUDFWeekOfYearLong.class, VectorUDFWeekOfYearString.class}) +@NDV(maxNdv = 52) public class UDFWeekOfYear extends UDF { private final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); private final Calendar calendar = Calendar.getInstance(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java index 34b0c47..de46104 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFYearLong; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFYearString; +import org.apache.hadoop.hive.ql.udf.generic.NDV; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; @@ -48,6 +49,7 @@ + "Example:\n " + " > SELECT _FUNC_('2009-07-30') FROM src LIMIT 1;\n" + " 2009") @VectorizedExpressions({VectorUDFYearLong.class, VectorUDFYearString.class}) +@NDV(maxNdv = 20) // although technically it's unbounded, it's unlikely we will ever see ndv > 20 public class UDFYear extends UDF { private final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); private final Calendar calendar = Calendar.getInstance(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAddMonths.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAddMonths.java index 82e5811..dd88473 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAddMonths.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFAddMonths.java @@ -49,6 +49,7 @@ + " 'yyyy-MM-dd'. num_months is a number. 
The time part of start_date is " + "ignored.\n" + "Example:\n " + " > SELECT _FUNC_('2009-08-31', 1) FROM src LIMIT 1;\n" + " '2009-09-30'") +@NDV(maxNdv = 250) // 250 seems to be a reasonable upper limit for this public class GenericUDFAddMonths extends GenericUDF { private transient Converter[] converters = new Converter[2]; private transient PrimitiveCategory[] inputTypes = new PrimitiveCategory[2]; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayContains.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayContains.java index 510f367..c031c61 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayContains.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFArrayContains.java @@ -37,6 +37,7 @@ extended="Example:\n" + " > SELECT _FUNC_(array(1, 2, 3), 2) FROM src LIMIT 1;\n" + " true") +@NDV(maxNdv = 2) public class GenericUDFArrayContains extends GenericUDF { private static final int ARRAY_IDX = 0; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBetween.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBetween.java index 9d9ee57..04f72a6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBetween.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFBetween.java @@ -28,6 +28,7 @@ import org.apache.hadoop.io.BooleanWritable; @Description(name = "between", value = "_FUNC_ a [NOT] BETWEEN b AND c - evaluate if a is [not] in between b and c") +@NDV(maxNdv = 2) public class GenericUDFBetween extends GenericUDF { GenericUDFOPEqualOrGreaterThan egt = new GenericUDFOPEqualOrGreaterThan(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java index 67f3c64..1f027a2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.udf.generic; import java.sql.Date; + import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; @@ -34,6 +35,7 @@ @Description(name = "current_date", value = "_FUNC_() - Returns the current date at the start of query evaluation." + " All calls of current_date within the same query return the same value.") +@NDV(maxNdv = 1) public class GenericUDFCurrentDate extends GenericUDF { protected DateWritable currentDate; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentTimestamp.java index cc7d0d4..2f13a22 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentTimestamp.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentTimestamp.java @@ -33,6 +33,7 @@ @Description(name = "current_timestamp", value = "_FUNC_() - Returns the current timestamp at the start of query evaluation." 
+ " All calls of current_timestamp within the same query return the same value.") +@NDV(maxNdv = 1) public class GenericUDFCurrentTimestamp extends GenericUDF { protected TimestampWritable currentTimestamp; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentUser.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentUser.java index 4a1514b..d97583d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentUser.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentUser.java @@ -30,6 +30,7 @@ @UDFType(deterministic = true) @Description(name = "current_user", value = "_FUNC_() - Returns current user name", extended = "SessionState UserFromAuthenticator") +@NDV(maxNdv = 1) public class GenericUDFCurrentUser extends GenericUDF { protected Text currentUser; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPAnd.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPAnd.java index eb33d98..fa0cda8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPAnd.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPAnd.java @@ -38,6 +38,7 @@ @Description(name = "and", value = "a1 _FUNC_ a2 _FUNC_ ... _FUNC_ an - Logical and") @VectorizedExpressions({ColAndCol.class, FilterExprAndExpr.class, FilterColAndScalar.class, FilterScalarAndColumn.class}) +@NDV(maxNdv = 2) public class GenericUDFOPAnd extends GenericUDF { private final BooleanWritable result = new BooleanWritable(); private transient BooleanObjectInspector boi[]; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqual.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqual.java index 27537bf..e82627d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqual.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqual.java @@ -66,6 +66,7 @@ DateColEqualDateScalar.class,FilterDateColEqualDateScalar.class, DateScalarEqualDateColumn.class,FilterDateScalarEqualDateColumn.class, }) +@NDV(maxNdv = 2) public class GenericUDFOPEqual extends GenericUDFBaseCompare { public GenericUDFOPEqual(){ this.opName = "EQUAL"; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualNS.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualNS.java index d0b35a7..3707a33 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualNS.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualNS.java @@ -23,6 +23,7 @@ @Description(name = "<=>", value = "a _FUNC_ b - Returns same result with EQUAL(=) operator " + "for non-null operands, but returns TRUE if both are NULL, FALSE if one of the them is NULL") +@NDV(maxNdv = 2) public class GenericUDFOPEqualNS extends GenericUDFOPEqual { @Override diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrGreaterThan.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrGreaterThan.java index 90d98bb..bfd71c7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrGreaterThan.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrGreaterThan.java @@ -67,6 +67,7 @@ DateColGreaterEqualDateScalar.class,FilterDateColGreaterEqualDateScalar.class, DateScalarGreaterEqualDateColumn.class,FilterDateScalarGreaterEqualDateColumn.class, }) +@NDV(maxNdv = 2) public class GenericUDFOPEqualOrGreaterThan extends 
GenericUDFBaseCompare { public GenericUDFOPEqualOrGreaterThan(){ this.opName = "EQUAL OR GREATER THAN"; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrLessThan.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrLessThan.java index 35133d4..1e69ee6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrLessThan.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPEqualOrLessThan.java @@ -67,6 +67,7 @@ DateColLessEqualDateScalar.class,FilterDateColLessEqualDateScalar.class, DateScalarLessEqualDateColumn.class,FilterDateScalarLessEqualDateColumn.class, }) +@NDV(maxNdv = 2) public class GenericUDFOPEqualOrLessThan extends GenericUDFBaseCompare { public GenericUDFOPEqualOrLessThan(){ this.opName = "EQUAL OR LESS THAN"; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPGreaterThan.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPGreaterThan.java index be05b4e..bba4d97 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPGreaterThan.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPGreaterThan.java @@ -67,6 +67,7 @@ DateColGreaterDateScalar.class,FilterDateColGreaterDateScalar.class, DateScalarGreaterDateColumn.class,FilterDateScalarGreaterDateColumn.class, }) +@NDV(maxNdv = 2) public class GenericUDFOPGreaterThan extends GenericUDFBaseCompare { public GenericUDFOPGreaterThan(){ this.opName = "GREATER THAN"; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPLessThan.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPLessThan.java index 9d72f9e..b992fe6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPLessThan.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPLessThan.java @@ -67,6 +67,7 @@ DateColLessDateScalar.class,FilterDateColLessDateScalar.class, DateScalarLessDateColumn.class,FilterDateScalarLessDateColumn.class, }) +@NDV(maxNdv = 2) public class GenericUDFOPLessThan extends GenericUDFBaseCompare { public GenericUDFOPLessThan(){ this.opName = "LESS THAN"; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNot.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNot.java index ea8fa71..2d1b013 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNot.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNot.java @@ -36,6 +36,7 @@ */ @Description(name = "not", value = "_FUNC_ a - Logical not") @VectorizedExpressions({NotCol.class, SelectColumnIsFalse.class}) +@NDV(maxNdv = 2) public class GenericUDFOPNot extends GenericUDF { private final BooleanWritable result = new BooleanWritable(); private transient BooleanObjectInspector boi; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotEqual.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotEqual.java index 7023225..ad47681 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotEqual.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotEqual.java @@ -41,7 +41,7 @@ StringGroupColNotEqualStringScalar.class, StringGroupColNotEqualVarCharScalar.class, StringGroupColNotEqualCharScalar.class, StringScalarNotEqualStringGroupColumn.class, - VarCharScalarNotEqualStringGroupColumn.class, CharScalarNotEqualStringGroupColumn.class, + VarCharScalarNotEqualStringGroupColumn.class, 
CharScalarNotEqualStringGroupColumn.class, FilterStringGroupColNotEqualStringScalar.class, FilterStringScalarNotEqualStringGroupColumn.class, FilterStringGroupColNotEqualVarCharScalar.class, FilterVarCharScalarNotEqualStringGroupColumn.class, FilterStringGroupColNotEqualCharScalar.class, FilterCharScalarNotEqualStringGroupColumn.class, @@ -66,6 +66,7 @@ DateColNotEqualDateScalar.class,FilterDateColNotEqualDateScalar.class, DateScalarNotEqualDateColumn.class,FilterDateScalarNotEqualDateColumn.class, }) +@NDV(maxNdv = 2) public class GenericUDFOPNotEqual extends GenericUDFBaseCompare { public GenericUDFOPNotEqual(){ this.opName = "NOT EQUAL"; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotNull.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotNull.java index 2b67c38..e208d59 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotNull.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNotNull.java @@ -36,6 +36,7 @@ @Description(name = "isnotnull", value = "_FUNC_ a - Returns true if a is not NULL and false otherwise") @VectorizedExpressions({IsNotNull.class, SelectColumnIsNotNull.class}) +@NDV(maxNdv = 2) public class GenericUDFOPNotNull extends GenericUDF { private final BooleanWritable result = new BooleanWritable(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNull.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNull.java index 4eb92eb..8c4b478 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNull.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPNull.java @@ -35,6 +35,7 @@ */ @Description(name = "isnull", value = "_FUNC_ a - Returns true if a is NULL and false otherwise") @VectorizedExpressions({IsNull.class, SelectColumnIsNull.class}) +@NDV(maxNdv = 2) public class GenericUDFOPNull extends GenericUDF { private final BooleanWritable result = new BooleanWritable(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPOr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPOr.java index 8de59c1..af38c97 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPOr.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPOr.java @@ -38,6 +38,7 @@ @Description(name = "or", value = "a1 _FUNC_ a2 _FUNC_ ... _FUNC_ an - Logical or") @VectorizedExpressions({ColOrCol.class, FilterExprOrExpr.class, FilterColOrScalar.class, FilterScalarOrColumn.class}) +@NDV(maxNdv = 2) public class GenericUDFOPOr extends GenericUDF { private final BooleanWritable result = new BooleanWritable(); private transient BooleanObjectInspector[] boi; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NDV.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NDV.java new file mode 100644 index 0000000..307135b --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NDV.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf.generic; + +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +@Retention(RetentionPolicy.RUNTIME) +public @interface NDV { + + long maxNdv(); +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/UDFCurrentDB.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/UDFCurrentDB.java index 5f484cf..a5bab4f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/UDFCurrentDB.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/UDFCurrentDB.java @@ -31,6 +31,7 @@ // deterministic in the query range @Description(name = "current_database", value = "_FUNC_() - returns currently using database name") +@NDV(maxNdv = 1) public class UDFCurrentDB extends GenericUDF { private MapredContext context; diff --git a/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out b/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out index 7822ad9..878175f 100644 --- a/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out +++ b/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out @@ -71,22 +71,22 @@ STAGE PLANS: 0 1 outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string) sort order: ++++ - Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE Local Work: Map Reduce Local Work Reduce Operator Tree: Select Operator expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(hash(_col0,_col1,_col2,_col3)) mode: hash @@ -204,22 +204,22 @@ STAGE PLANS: 0 1 outputColumnNames: _col0, _col1, _col5, _col6 - Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column 
stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string) sort order: ++++ - Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE Local Work: Map Reduce Local Work Reduce Operator Tree: Select Operator expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator aggregations: sum(hash(_col0,_col1,_col2,_col3)) mode: hash diff --git a/ql/src/test/results/clientpositive/tez/explainuser_1.q.out b/ql/src/test/results/clientpositive/tez/explainuser_1.q.out index a3d1f87..2b7eac6 100644 --- a/ql/src/test/results/clientpositive/tez/explainuser_1.q.out +++ b/ql/src/test/results/clientpositive/tez/explainuser_1.q.out @@ -2954,11 +2954,11 @@ Stage-0 Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE Select Operator [SEL_10] outputColumnNames:["_col0"] - Statistics:Num rows: 200 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics:Num rows: 400 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE Merge Join Operator [MERGEJOIN_19] | condition map:[{"":"Inner Join 0 to 1"}] | keys:{} - | Statistics:Num rows: 200 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + | Statistics:Num rows: 400 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE |<-Map 1 [SIMPLE_EDGE] | Reduce Output Operator [RS_7] | sort order: @@ -5175,13 +5175,13 @@ Stage-0 Reducer 2 File Output Operator [FS_9] compressed:true - Statistics:Num rows: 125000 Data size: 10875000 Basic stats: COMPLETE Column stats: COMPLETE + Statistics:Num rows: 250000 Data size: 21750000 Basic stats: COMPLETE Column stats: COMPLETE table:{"input format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat","serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"} Merge Join Operator [MERGEJOIN_11] | condition map:[{"":"Inner Join 0 to 1"}] | keys:{} | outputColumnNames:["_col0"] - | Statistics:Num rows: 125000 Data size: 10875000 Basic stats: COMPLETE Column stats: COMPLETE + | Statistics:Num rows: 250000 Data size: 21750000 Basic stats: COMPLETE Column stats: COMPLETE |<-Map 1 [SIMPLE_EDGE] | Reduce Output Operator [RS_5] | sort order: @@ -5417,21 +5417,21 @@ Stage-0 Reducer 3 File Output Operator [FS_12] compressed:true - Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE + Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE table:{"input format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat","serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"} Select Operator [SEL_11] | outputColumnNames:["_col0","_col1","_col2","_col3"] - | Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE + | Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE |<-Reducer 2 [SIMPLE_EDGE] Reduce Output Operator [RS_10] key expressions:_col0 (type: string), _col1 (type: string), _col2 
(type: string), _col3 (type: string) sort order:++++ - Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE + Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE Merge Join Operator [MERGEJOIN_15] | condition map:[{"":"Inner Join 0 to 1"}] | keys:{} | outputColumnNames:["_col0","_col1","_col2","_col3"] - | Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE + | Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE |<-Map 1 [SIMPLE_EDGE] | Reduce Output Operator [RS_6] | sort order: -- 1.7.12.4 (Apple Git-37)
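
Note (not part of the patch): the "eased out" denominator that this change moves from StatsRulesProcFactory.getEasedOutDenominator into the shared StatsUtils.addWithExpDecay helper is easier to evaluate with a concrete input. The sketch below is a minimal standalone program; the class name ExpDecayDemo and the sample NDVs are invented for illustration, while the method body mirrors the patch.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    public class ExpDecayDemo {

      // Mirrors StatsUtils.addWithExpDecay from this patch:
      // denominator = NDV1 * NDV2^(1/2) * NDV3^(1/4) * ... over NDVs
      // sorted in descending order.
      static long addWithExpDecay(List<Long> distinctVals) {
        Collections.sort(distinctVals, Collections.reverseOrder());
        long denom = distinctVals.get(0);
        for (int i = 1; i < distinctVals.size(); i++) {
          denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i)));
        }
        return denom;
      }

      public static void main(String[] args) {
        // Three join attributes with per-attribute NDVs 1000, 100 and 10.
        // A plain product gives 1,000,000; the decayed denominator is
        // 1000 * 100^(1/2) * 10^(1/4) ~= 17782, so the join row-count
        // estimate is reduced far less aggressively.
        List<Long> ndvs = new ArrayList<>(Arrays.asList(1000L, 100L, 10L));
        System.out.println(addWithExpDecay(ndvs));
      }
    }

This is the case named in the subject line: with more join attributes than source relations (numAttr > numParent), the JoinStatsRule now uses the decayed denominator instead of the raw product, which is what corrects the 3-attribute, 2-source estimates in the updated q.out files.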
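
Note (not part of the patch): the new StatsUtils.getNDVFor chooses an NDV for a function expression from three bounds. The condensed sketch below restates that logic; ndvForExpression and its parameters are simplified stand-ins rather than the actual Hive signature, and it reuses addWithExpDecay from the sketch above (drop it into the same ExpDecayDemo class to run it).

    // Non-deterministic UDFs are handled before this point in the patch
    // and simply return numRows.
    static long ndvForExpression(Long annotatedMaxNdv,  // @NDV(maxNdv=...) if present, else null
                                 List<Long> argumentColumnNdvs,
                                 long numRows) {
      long udfNdv = Long.MAX_VALUE;
      List<Long> ndvs = new ArrayList<>();
      if (annotatedMaxNdv != null) {
        // e.g. 24 for hour(), 2 for comparison operators, 1 for current_date()
        udfNdv = annotatedMaxNdv;
      } else {
        // no annotation: fall back to the NDVs of the argument columns
        ndvs.addAll(argumentColumnNdvs);
      }
      long countDistincts = ndvs.isEmpty() ? numRows : addWithExpDecay(ndvs);
      // the final NDV can never exceed the row count
      return Math.min(Math.min(countDistincts, udfNdv), numRows);
    }

For example, ndvForExpression(24L, anyList, 1_000_000L) returns 24. The old code hard-coded countDistincts = numRows for every function expression, which is why a join key wrapped in something like hour() used to contribute the full row count to the denominator.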
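
Note (not part of the patch): third-party UDFs can opt in the same way the built-ins do here, since getNDVFor reads the annotation off the UDF class (unwrapping GenericUDFBridge for legacy UDFs) via AnnotationUtils. The class below is a made-up example, not shipped code; only the @NDV usage is the point, and the argument handling is kept deliberately minimal.

    import org.apache.hadoop.hive.ql.exec.Description;
    import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
    import org.apache.hadoop.hive.ql.udf.generic.NDV;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    import org.apache.hadoop.io.Text;

    // Made-up UDF that buckets a number into one of three labels and
    // advertises that bound to the stats annotator via @NDV.
    @Description(name = "sign_label", value = "_FUNC_(n) - returns 'negative', 'zero' or 'positive'")
    @NDV(maxNdv = 3)
    public class GenericUDFSignLabel extends GenericUDF {
      private final Text result = new Text();

      @Override
      public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentLengthException {
        if (arguments.length != 1) {
          throw new UDFArgumentLengthException("sign_label takes exactly one argument");
        }
        return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
      }

      @Override
      public Object evaluate(DeferredObject[] arguments) throws HiveException {
        Object value = arguments[0].get();
        if (value == null) {
          return null;
        }
        // simplified: parse via toString instead of a proper ObjectInspector
        double n = Double.parseDouble(value.toString());
        result.set(n < 0 ? "negative" : n == 0 ? "zero" : "positive");
        return result;
      }

      @Override
      public String getDisplayString(String[] children) {
        return "sign_label(" + children[0] + ")";
      }
    }

With this in place, an expression such as sign_label(x) would be estimated at no more than 3 distinct values instead of defaulting to the parent row count.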