diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index a8ff158..c1e314f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -967,20 +967,6 @@ private boolean checkMapSideAggregation(GroupByOperator gop,
       // worst-case, hash aggregation disabled
       return false;
     }
-
-    private long applyGBYRule(long numRows, long dvProd) {
-      long newNumRows = numRows;
-
-      // to avoid divide by 2 to become 0
-      if (numRows > 1) {
-        if (dvProd != 0) {
-          newNumRows = Math.min(numRows / 2, dvProd);
-        } else {
-          newNumRows = numRows / 2;
-        }
-      }
-      return newNumRows;
-    }
   }
 
   /**
@@ -1032,170 +1018,156 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
       int numAttr = 1;
       AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx;
       HiveConf conf = aspCtx.getConf();
-      boolean allStatsAvail = true;
       boolean allSatisfyPreCondition = true;
 
       for (Operator<? extends OperatorDesc> op : parents) {
         if (op.getStatistics() == null) {
-          allStatsAvail = false;
+          return null;
         }
       }
 
-      if (allStatsAvail) {
-
-        for (Operator<? extends OperatorDesc> op : parents) {
-          if (!satisfyPrecondition(op.getStatistics())) {
-            allSatisfyPreCondition = false;
-          }
+      for (Operator<? extends OperatorDesc> op : parents) {
+        if (!satisfyPrecondition(op.getStatistics())) {
+          allSatisfyPreCondition = false;
+          break;
         }
+      }
 
-        if (allSatisfyPreCondition) {
-
-          // statistics object that is combination of statistics from all
-          // relations involved in JOIN
-          Statistics stats = new Statistics();
-          List<Long> distinctVals = Lists.newArrayList();
-          int numParent = parents.size();
-          Map<Integer, Long> rowCountParents = Maps.newHashMap();
-          Map<Integer, Statistics> joinStats = Maps.newHashMap();
-          Map<Integer, List<String>> joinKeys = Maps.newHashMap();
-          List<Long> rowCounts = Lists.newArrayList();
-
-          // detect if there are multiple attributes in join key
-          ReduceSinkOperator rsOp = (ReduceSinkOperator) jop.getParentOperators().get(0);
-          List<String> keyExprs = StatsUtils.getQualifedReducerKeyNames(rsOp.getConf()
+      if (allSatisfyPreCondition) {
+
+        // statistics object that is combination of statistics from all
+        // relations involved in JOIN
+        Statistics stats = new Statistics();
+        int numParent = parents.size();
+        Map<Integer, Long> rowCountParents = Maps.newHashMap();
+        Map<Integer, Statistics> joinStats = Maps.newHashMap();
+        Map<Integer, List<String>> joinKeys = Maps.newHashMap();
+        List<Long> rowCounts = Lists.newArrayList();
+
+        // detect if there are multiple attributes in join key
+        ReduceSinkOperator rsOp = (ReduceSinkOperator) jop.getParentOperators().get(0);
+        List<String> keyExprs = StatsUtils.getQualifedReducerKeyNames(rsOp.getConf()
+            .getOutputKeyColumnNames());
+        numAttr = keyExprs.size();
+
+        // infer PK-FK relationship in single attribute join case
+        long inferredRowCount = inferPKFKRelationship(numAttr, parents, jop);
+        // get the join keys from parent ReduceSink operators
+        for (int pos = 0; pos < parents.size(); pos++) {
+          ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
+          Statistics parentStats = parent.getStatistics();
+          keyExprs = StatsUtils.getQualifedReducerKeyNames(parent.getConf()
              .getOutputKeyColumnNames());
-          numAttr = keyExprs.size();
-
-          // infer PK-FK relationship in single attribute join case
-          long inferredRowCount = inferPKFKRelationship(numAttr, parents, jop);
-          // get the join keys from parent ReduceSink operators
-          for (int pos = 0; pos < parents.size(); pos++) {
-            ReduceSinkOperator parent = (ReduceSinkOperator) jop.getParentOperators().get(pos);
-            Statistics parentStats = parent.getStatistics();
-            keyExprs = StatsUtils.getQualifedReducerKeyNames(parent.getConf()
-                .getOutputKeyColumnNames());
-
-            rowCountParents.put(pos, parentStats.getNumRows());
-            rowCounts.add(parentStats.getNumRows());
+          rowCountParents.put(pos, parentStats.getNumRows());
+          rowCounts.add(parentStats.getNumRows());
 
-            // internal name for expressions and estimate column statistics for expression.
-            joinKeys.put(pos, keyExprs);
+          // internal name for expressions and estimate column statistics for expression.
+          joinKeys.put(pos, keyExprs);
 
-            // get column statistics for all output columns
-            joinStats.put(pos, parentStats);
+          // get column statistics for all output columns
+          joinStats.put(pos, parentStats);
 
-            // since new statistics is derived from all relations involved in
-            // JOIN, we need to update the state information accordingly
-            stats.updateColumnStatsState(parentStats.getColumnStatsState());
-          }
+          // since new statistics is derived from all relations involved in
+          // JOIN, we need to update the state information accordingly
+          stats.updateColumnStatsState(parentStats.getColumnStatsState());
+        }
 
-          // compute denominator i.e, max(V(R,Y), V(S,Y)) in case of single
-          // attribute join, else max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2))
+        List<Long> distinctVals = Lists.newArrayList();
+        long denom = 1;
+        if (inferredRowCount == -1) {
+          // failed to infer PK-FK relationship for row count estimation fall-back on default logic
+          // compute denominator max(V(R,y1), V(S,y1)) * max(V(R,y2), V(S,y2))
           // in case of multi-attribute join
-          long denom = 1;
-          if (numAttr > 1) {
-            List<Long> perAttrDVs = Lists.newArrayList();
-            for (int idx = 0; idx < numAttr; idx++) {
-              for (Integer i : joinKeys.keySet()) {
-                String col = joinKeys.get(i).get(idx);
-                ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col);
-                if (cs != null) {
-                  perAttrDVs.add(cs.getCountDistint());
-                }
+          List<Long> perAttrDVs = Lists.newArrayList();
+          for (int idx = 0; idx < numAttr; idx++) {
+            for (Integer i : joinKeys.keySet()) {
+              String col = joinKeys.get(i).get(idx);
+              ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col);
+              if (cs != null) {
+                perAttrDVs.add(cs.getCountDistint());
               }
-              distinctVals.add(getDenominator(perAttrDVs));
-              perAttrDVs.clear();
             }
+            distinctVals.add(getDenominator(perAttrDVs));
+            perAttrDVs.clear();
+          }
 
-            if (numAttr > numParent) {
-              // To avoid denominator getting larger and aggressively reducing
-              // number of rows, we will ease out denominator.
-              denom = getEasedOutDenominator(distinctVals);
-            } else {
-              for (Long l : distinctVals) {
-                denom = StatsUtils.safeMult(denom, l);
-              }
-            }
+          if (numAttr > numParent) {
+            // To avoid denominator getting larger and aggressively reducing
+            // number of rows, we will ease out denominator.
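+            // (illustrative note: e.g. a two-way join on three key columns has numAttr = 3 > numParent = 2)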
+            denom = StatsUtils.addWithExpDecay(distinctVals);
           } else {
-            if (numAttr == 1) {
-              for (Integer i : joinKeys.keySet()) {
-                String col = joinKeys.get(i).get(0);
-                ColStatistics cs = joinStats.get(i).getColumnStatisticsFromColName(col);
-                if (cs != null) {
-                  distinctVals.add(cs.getCountDistint());
-                }
-              }
+            for (Long l : distinctVals) {
+              denom = StatsUtils.safeMult(denom, l);
             }
-            denom = getDenominator(distinctVals);
           }
+        }
 
-          // Update NDV of joined columns to be min(V(R,y), V(S,y))
-          updateJoinColumnsNDV(joinKeys, joinStats, numAttr);
-
-          // column statistics from different sources are put together and
-          // rename based on output schema of join operator
-          Map<String, ExprNodeDesc> colExprMap = jop.getColumnExprMap();
-          RowSchema rs = jop.getSchema();
-          List<ColStatistics> outColStats = Lists.newArrayList();
-          for (ColumnInfo ci : rs.getSignature()) {
-            String key = ci.getInternalName();
-            ExprNodeDesc end = colExprMap.get(key);
-            if (end instanceof ExprNodeColumnDesc) {
-              String colName = ((ExprNodeColumnDesc) end).getColumn();
-              int pos = jop.getConf().getReversedExprs().get(key);
-              ColStatistics cs = joinStats.get(pos).getColumnStatisticsFromColName(colName);
-              String outColName = key;
-              if (cs != null) {
-                cs.setColumnName(outColName);
-              }
-              outColStats.add(cs);
+        // Update NDV of joined columns to be min(V(R,y), V(S,y))
+        updateJoinColumnsNDV(joinKeys, joinStats, numAttr);
+
+        // column statistics from different sources are put together and
+        // rename based on output schema of join operator
+        Map<String, ExprNodeDesc> colExprMap = jop.getColumnExprMap();
+        RowSchema rs = jop.getSchema();
+        List<ColStatistics> outColStats = Lists.newArrayList();
+        for (ColumnInfo ci : rs.getSignature()) {
+          String key = ci.getInternalName();
+          ExprNodeDesc end = colExprMap.get(key);
+          if (end instanceof ExprNodeColumnDesc) {
+            String colName = ((ExprNodeColumnDesc) end).getColumn();
+            int pos = jop.getConf().getReversedExprs().get(key);
+            ColStatistics cs = joinStats.get(pos).getColumnStatisticsFromColName(colName);
+            String outColName = key;
+            if (cs != null) {
+              cs.setColumnName(outColName);
             }
+            outColStats.add(cs);
           }
+        }
 
-          // update join statistics
-          stats.setColumnStats(outColStats);
-          long newRowCount = inferredRowCount !=-1 ? inferredRowCount : computeNewRowCount(rowCounts, denom);
-          updateStatsForJoinType(stats, newRowCount, jop, rowCountParents);
-          jop.setStatistics(stats);
+        // update join statistics
+        stats.setColumnStats(outColStats);
+        long newRowCount = inferredRowCount !=-1 ? inferredRowCount : computeNewRowCount(rowCounts, denom);
+        updateStatsForJoinType(stats, newRowCount, jop, rowCountParents);
+        jop.setStatistics(stats);
 
-          if (isDebugEnabled) {
-            LOG.debug("[0] STATS-" + jop.toString() + ": " + stats.extendedToString());
-          }
-        } else {
+        if (isDebugEnabled) {
+          LOG.debug("[0] STATS-" + jop.toString() + ": " + stats.extendedToString());
+        }
+      } else {
 
-          // worst case when there are no column statistics
-          float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR);
-          int numParents = parents.size();
-          List<Long> parentRows = Lists.newArrayList();
-          List<Long> parentSizes = Lists.newArrayList();
-          int maxRowIdx = 0;
-          long maxRowCount = 0;
-          int idx = 0;
-
-          for (Operator<? extends OperatorDesc> op : parents) {
-            Statistics ps = op.getStatistics();
-            long rowCount = ps.getNumRows();
-            if (rowCount > maxRowCount) {
-              maxRowCount = rowCount;
-              maxRowIdx = idx;
-            }
-            parentRows.add(rowCount);
-            parentSizes.add(ps.getDataSize());
-            idx++;
+        // worst case when there are no column statistics
+        float joinFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_JOIN_FACTOR);
+        int numParents = parents.size();
+        List<Long> parentRows = Lists.newArrayList();
+        List<Long> parentSizes = Lists.newArrayList();
+        int maxRowIdx = 0;
+        long maxRowCount = 0;
+        int idx = 0;
+
+        for (Operator<? extends OperatorDesc> op : parents) {
+          Statistics ps = op.getStatistics();
+          long rowCount = ps.getNumRows();
+          if (rowCount > maxRowCount) {
+            maxRowCount = rowCount;
+            maxRowIdx = idx;
           }
+          parentRows.add(rowCount);
+          parentSizes.add(ps.getDataSize());
+          idx++;
+        }
 
-          long maxDataSize = parentSizes.get(maxRowIdx);
-          newNumRows = StatsUtils.safeMult(StatsUtils.safeMult(maxRowCount, (numParents - 1)), joinFactor);
-          long newDataSize = StatsUtils.safeMult(StatsUtils.safeMult(maxDataSize, (numParents - 1)), joinFactor);
-          Statistics wcStats = new Statistics();
-          wcStats.setNumRows(newNumRows);
-          wcStats.setDataSize(newDataSize);
-          jop.setStatistics(wcStats);
+        long maxDataSize = parentSizes.get(maxRowIdx);
+        newNumRows = StatsUtils.safeMult(StatsUtils.safeMult(maxRowCount, (numParents - 1)), joinFactor);
+        long newDataSize = StatsUtils.safeMult(StatsUtils.safeMult(maxDataSize, (numParents - 1)), joinFactor);
+        Statistics wcStats = new Statistics();
+        wcStats.setNumRows(newNumRows);
+        wcStats.setDataSize(newDataSize);
+        jop.setStatistics(wcStats);
 
-          if (isDebugEnabled) {
-            LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString());
-          }
+        if (isDebugEnabled) {
+          LOG.debug("[1] STATS-" + jop.toString() + ": " + wcStats.extendedToString());
         }
       }
       return null;
@@ -1204,44 +1176,46 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
 
     private long inferPKFKRelationship(int numAttr, List<Operator<? extends OperatorDesc>> parents,
         CommonJoinOperator<? extends JoinDesc> jop) {
       long newNumRows = -1;
-      if (numAttr == 1) {
-        // If numAttr is 1, this means we join on one single key column.
-        Map<Integer, ColStatistics> parentsWithPK = getPrimaryKeyCandidates(parents);
-
-        // We only allow one single PK.
-        if (parentsWithPK.size() != 1) {
-          LOG.debug("STATS-" + jop.toString() + ": detects none/multiple PK parents.");
-          return newNumRows;
-        }
-        Integer pkPos = parentsWithPK.keySet().iterator().next();
-        ColStatistics csPK = parentsWithPK.values().iterator().next();
+      if (numAttr != 1) {
+        return newNumRows;
+      }
 
-        // infer foreign key candidates positions
-        Map<Integer, ColStatistics> csFKs = getForeignKeyCandidates(parents, csPK);
+      // If numAttr is 1, this means we join on one single key column.
+      Map<Integer, ColStatistics> parentsWithPK = getPrimaryKeyCandidates(parents);
 
-        // we allow multiple foreign keys (snowflake schema)
-        // csfKs.size() + 1 == parents.size() means we have a single PK and all
-        // the rest ops are FKs.
-        if (csFKs.size() + 1 == parents.size()) {
-          newNumRows = getCardinality(parents, pkPos, csPK, csFKs, jop);
+      // We only allow one single PK.
+      if (parentsWithPK.size() != 1) {
+        LOG.debug("STATS-" + jop.toString() + ": detects none/multiple PK parents.");
+        return newNumRows;
+      }
+      Integer pkPos = parentsWithPK.keySet().iterator().next();
+      ColStatistics csPK = parentsWithPK.values().iterator().next();
 
-          // some debug information
-          if (isDebugEnabled) {
-            List<String> parentIds = Lists.newArrayList();
+      // infer foreign key candidates positions
+      Map<Integer, ColStatistics> csFKs = getForeignKeyCandidates(parents, csPK);
 
-            // print primary key containing parents
-            for (Integer i : parentsWithPK.keySet()) {
-              parentIds.add(parents.get(i).toString());
-            }
-            LOG.debug("STATS-" + jop.toString() + ": PK parent id(s) - " + parentIds);
-            parentIds.clear();
+      // we allow multiple foreign keys (snowflake schema)
+      // csfKs.size() + 1 == parents.size() means we have a single PK and all
+      // the rest ops are FKs.
+      if (csFKs.size() + 1 == parents.size()) {
+        newNumRows = getCardinality(parents, pkPos, csPK, csFKs, jop);
 
-            // print foreign key containing parents
-            for (Integer i : csFKs.keySet()) {
-              parentIds.add(parents.get(i).toString());
-            }
-            LOG.debug("STATS-" + jop.toString() + ": FK parent id(s) - " + parentIds);
+        // some debug information
+        if (isDebugEnabled) {
+          List<String> parentIds = Lists.newArrayList();
+
+          // print primary key containing parents
+          for (Integer i : parentsWithPK.keySet()) {
+            parentIds.add(parents.get(i).toString());
           }
+          LOG.debug("STATS-" + jop.toString() + ": PK parent id(s) - " + parentIds);
+          parentIds.clear();
+
+          // print foreign key containing parents
+          for (Integer i : csFKs.keySet()) {
+            parentIds.add(parents.get(i).toString());
+          }
+          LOG.debug("STATS-" + jop.toString() + ": FK parent id(s) - " + parentIds);
         }
       }
       return newNumRows;
@@ -1425,20 +1399,6 @@ private float getSelectivityComplexTree(Operator<? extends OperatorDesc> op) {
     return result;
   }
 
-  private Long getEasedOutDenominator(List<Long> distinctVals) {
-    // Exponential back-off for NDVs.
-    // 1) Descending order sort of NDVs
-    // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4))) * ....
-    Collections.sort(distinctVals, Collections.reverseOrder());
-
-    long denom = distinctVals.get(0);
-    for (int i = 1; i < distinctVals.size(); i++) {
-      denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i)));
-    }
-
-    return denom;
-  }
-
   private void updateStatsForJoinType(Statistics stats, long newNumRows,
       CommonJoinOperator<? extends JoinDesc> jop, Map<Integer, Long> rowCountParents) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 71ed31c..4bf6a97 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -36,6 +36,7 @@
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.Decimal;
 import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
 import org.apache.hadoop.hive.ql.exec.RowSchema;
 import org.apache.hadoop.hive.ql.exec.TableScanOperator;
 import org.apache.hadoop.hive.ql.exec.Utilities;
@@ -43,6 +44,7 @@
 import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.metadata.Partition;
 import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.stats.annotation.StatsRulesProcFactory;
 import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
 import org.apache.hadoop.hive.ql.plan.ColStatistics;
 import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
@@ -54,6 +56,9 @@
 import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
 import org.apache.hadoop.hive.ql.plan.Statistics;
 import org.apache.hadoop.hive.ql.plan.Statistics.State;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
+import org.apache.hadoop.hive.ql.udf.generic.NDV;
 import org.apache.hadoop.hive.ql.util.JavaDataModel;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
@@ -85,11 +90,13 @@ import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
 import org.apache.hadoop.io.BytesWritable;
+import org.apache.hive.common.util.AnnotationUtils;
 import org.apache.tez.mapreduce.hadoop.MRJobConfig;
 
 import java.math.BigDecimal;
 import java.math.BigInteger;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -1247,7 +1254,7 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis
       // null projection
       if (encd.getValue() == null) {
         colName = encd.getName();
-        colType = "null";
+        colType = serdeConstants.VOID_TYPE_NAME;
         numNulls = numRows;
       } else {
         colName = encd.getName();
@@ -1261,14 +1268,14 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis
       ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end;
       colName = engfd.getName();
       colType = engfd.getTypeString();
-      countDistincts = numRows;
+      countDistincts = getNDVFor(engfd, numRows, parentStats);
       oi = engfd.getWritableObjectInspector();
     } else if (end instanceof ExprNodeColumnListDesc) {
 
       // column list
       ExprNodeColumnListDesc encd = (ExprNodeColumnListDesc) end;
       colName = Joiner.on(",").join(encd.getCols());
-      colType = "array";
+      colType = serdeConstants.LIST_TYPE_NAME;
       countDistincts = numRows;
       oi = encd.getWritableObjectInspector();
     } else if (end instanceof ExprNodeFieldDesc) {
@@ -1305,6 +1312,49 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis
     return colStats;
   }
+
+  public static Long addWithExpDecay (List<Long> distinctVals) {
+    // Exponential back-off for NDVs.
+    // 1) Descending order sort of NDVs
+    // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4))) * ....
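+    //    (illustrative example: NDVs {100, 50, 16} give 100 * 50^(1/2) * 16^(1/4), roughly 1414)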
+    Collections.sort(distinctVals, Collections.reverseOrder());
+
+    long denom = distinctVals.get(0);
+    for (int i = 1; i < distinctVals.size(); i++) {
+      denom = (long) (denom * Math.pow(distinctVals.get(i), 1.0 / (1 << i)));
+    }
+
+    return denom;
+  }
+
+  private static long getNDVFor(ExprNodeGenericFuncDesc engfd, long numRows, Statistics parentStats) {
+
+    GenericUDF udf = engfd.getGenericUDF();
+    if (!FunctionRegistry.isDeterministic(udf)){
+      return numRows;
+    }
+    List<Long> ndvs = Lists.newArrayList();
+    Class<?> udfClass = udf instanceof GenericUDFBridge ? ((GenericUDFBridge) udf).getUdfClass() : udf.getClass();
+    NDV ndv = AnnotationUtils.getAnnotation(udfClass, NDV.class);
+    if (ndv != null) {
+      // shortcut for udfs for which NDV is bounded
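+      // (e.g. the @NDV annotations this patch adds: month() 31, year() 20, current_date() 1)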
+      ndvs.add(ndv.ndv());
+    } else {
+      for (String col : engfd.getCols()) {
+        ColStatistics stats = parentStats.getColumnStatisticsFromColName(col);
+        if (stats != null) {
+          ndvs.add(stats.getCountDistint());
+        }
+      }
+    }
+    if (ndvs.isEmpty()) {
+      return numRows;
+    } else {
+      long countDistincts = addWithExpDecay(ndvs);
+      return countDistincts > numRows ? numRows : countDistincts;
+    }
+  }
+
   /**
    * Get number of rows of a give table
    * @return number of rows
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java
index 3365804..ffe92e1 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMonth.java
@@ -28,6 +28,7 @@
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFMonthLong;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFMonthString;
+import org.apache.hadoop.hive.ql.udf.generic.NDV;
 import org.apache.hadoop.hive.serde2.io.DateWritable;
 import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable;
 import org.apache.hadoop.hive.serde2.io.TimestampWritable;
@@ -48,11 +49,12 @@
         + "Example:\n"
         + "  > SELECT _FUNC_('2009-07-30') FROM src LIMIT 1;\n"
         + "  7")
 @VectorizedExpressions({VectorUDFMonthLong.class, VectorUDFMonthString.class})
+@NDV(ndv = 31)
 public class UDFMonth extends UDF {
   private final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
   private final Calendar calendar = Calendar.getInstance();
 
-  private IntWritable result = new IntWritable();
+  private final IntWritable result = new IntWritable();
 
   public UDFMonth() {
   }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java
index 34b0c47..e59f31a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFYear.java
@@ -28,6 +28,7 @@
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFYearLong;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFYearString;
+import org.apache.hadoop.hive.ql.udf.generic.NDV;
 import org.apache.hadoop.hive.serde2.io.DateWritable;
 import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable;
 import org.apache.hadoop.hive.serde2.io.TimestampWritable;
@@ -48,6 +49,7 @@
         + "Example:\n "
         + "  > SELECT _FUNC_('2009-07-30') FROM src LIMIT 1;\n"
         + "  2009")
 @VectorizedExpressions({VectorUDFYearLong.class, VectorUDFYearString.class})
+@NDV(ndv = 20) // although technically it's unbounded, it's unlikely we will ever see ndv > 20
 public class UDFYear extends UDF {
   private final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
   private final Calendar calendar = Calendar.getInstance();
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java
index 67f3c64..24bbda1 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCurrentDate.java
@@ -18,6 +18,7 @@
 package org.apache.hadoop.hive.ql.udf.generic;
 
 import java.sql.Date;
+
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
@@ -34,6 +35,7 @@
 @Description(name = "current_date",
     value = "_FUNC_() - Returns the current date at the start of query evaluation."
     + " All calls of current_date within the same query return the same value.")
+@NDV(ndv = 1)
 public class GenericUDFCurrentDate extends GenericUDF {
 
   protected DateWritable currentDate;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NDV.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NDV.java
new file mode 100644
index 0000000..91b4458
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NDV.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+
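+/**
+ * Declares how many distinct values a deterministic UDF can produce, so that the
+ * statistics annotation rules can bound NDV estimates for expressions over the UDF
+ * (see StatsUtils.getNDVFor). Retained at runtime so it is visible to reflection.
+ */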
+@Retention(RetentionPolicy.RUNTIME)
+public @interface NDV {
+
+  long ndv();
+}
diff --git a/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out b/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out
index 7822ad9..878175f 100644
--- a/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out
+++ b/ql/src/test/results/clientpositive/cbo_rp_auto_join0.q.out
@@ -71,22 +71,22 @@ STAGE PLANS:
                 0 
                 1 
               outputColumnNames: _col0, _col1, _col5, _col6
-              Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
                 outputColumnNames: _col0, _col1, _col2, _col3
-                Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
                 Reduce Output Operator
                   key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string)
                   sort order: ++++
-                  Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
       Local Work:
         Map Reduce Local Work
       Reduce Operator Tree:
         Select Operator
          expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string)
          outputColumnNames: _col0, _col1, _col2, _col3
-         Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+         Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
          Group By Operator
            aggregations: sum(hash(_col0,_col1,_col2,_col3))
            mode: hash
@@ -204,22 +204,22 @@ STAGE PLANS:
                 0 
                 1 
               outputColumnNames: _col0, _col1, _col5, _col6
-              Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
               Select Operator
                 expressions: _col0 (type: string), _col1 (type: string), _col5 (type: string), _col6 (type: string)
                 outputColumnNames: _col0, _col1, _col2, _col3
-                Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
                 Reduce Output Operator
                   key expressions: _col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string)
                   sort order: ++++
-                  Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
       Local Work:
         Map Reduce Local Work
      Reduce Operator Tree:
        Select Operator
          expressions: KEY.reducesinkkey0 (type: string), KEY.reducesinkkey1 (type: string), KEY.reducesinkkey2 (type: string), KEY.reducesinkkey3 (type: string)
          outputColumnNames: _col0, _col1, _col2, _col3
-         Statistics: Num rows: 18 Data size: 6120 Basic stats: COMPLETE Column stats: COMPLETE
+         Statistics: Num rows: 36 Data size: 12240 Basic stats: COMPLETE Column stats: COMPLETE
          Group By Operator
            aggregations: sum(hash(_col0,_col1,_col2,_col3))
            mode: hash
diff --git a/ql/src/test/results/clientpositive/tez/explainuser_1.q.out b/ql/src/test/results/clientpositive/tez/explainuser_1.q.out
index a3d1f87..2b7eac6 100644
--- a/ql/src/test/results/clientpositive/tez/explainuser_1.q.out
+++ b/ql/src/test/results/clientpositive/tez/explainuser_1.q.out
@@ -2954,11 +2954,11 @@ Stage-0
                 Statistics:Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
                 Select Operator [SEL_10]
                    outputColumnNames:["_col0"]
-                   Statistics:Num rows: 200 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
+                   Statistics:Num rows: 400 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
                    Merge Join Operator [MERGEJOIN_19]
                    |  condition map:[{"":"Inner Join 0 to 1"}]
                    |  keys:{}
-                   |  Statistics:Num rows: 200 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
+                   |  Statistics:Num rows: 400 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE
                    |<-Map 1 [SIMPLE_EDGE]
                    |  Reduce Output Operator [RS_7]
                    |     sort order:
@@ -5175,13 +5175,13 @@ Stage-0
         Reducer 2
         File Output Operator [FS_9]
            compressed:true
-           Statistics:Num rows: 125000 Data size: 10875000 Basic stats: COMPLETE Column stats: COMPLETE
+           Statistics:Num rows: 250000 Data size: 21750000 Basic stats: COMPLETE Column stats: COMPLETE
            table:{"input format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat","serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"}
            Merge Join Operator [MERGEJOIN_11]
            |  condition map:[{"":"Inner Join 0 to 1"}]
            |  keys:{}
            |  outputColumnNames:["_col0"]
-           |  Statistics:Num rows: 125000 Data size: 10875000 Basic stats: COMPLETE Column stats: COMPLETE
+           |  Statistics:Num rows: 250000 Data size: 21750000 Basic stats: COMPLETE Column stats: COMPLETE
            |<-Map 1 [SIMPLE_EDGE]
            |  Reduce Output Operator [RS_5]
            |     sort order:
@@ -5417,21 +5417,21 @@ Stage-0
         Reducer 3
         File Output Operator [FS_12]
            compressed:true
-           Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE
+           Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE
            table:{"input format:":"org.apache.hadoop.mapred.TextInputFormat","output format:":"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat","serde:":"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"}
            Select Operator [SEL_11]
            |  outputColumnNames:["_col0","_col1","_col2","_col3"]
-           |  Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE
+           |  Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE
           |<-Reducer 2 [SIMPLE_EDGE]
              Reduce Output Operator [RS_10]
                 key expressions:_col0 (type: string), _col1 (type: string), _col2 (type: string), _col3 (type: string)
                 sort order:++++
-                Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE
                 Merge Join Operator [MERGEJOIN_15]
                 |  condition map:[{"":"Inner Join 0 to 1"}]
                 |  keys:{}
                 |  outputColumnNames:["_col0","_col1","_col2","_col3"]
-                |  Statistics:Num rows: 13778 Data size: 4904968 Basic stats: COMPLETE Column stats: COMPLETE
+                |  Statistics:Num rows: 27556 Data size: 9809936 Basic stats: COMPLETE Column stats: COMPLETE
                 |<-Map 1 [SIMPLE_EDGE]
                 |  Reduce Output Operator [RS_6]
                 |     sort order: