diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 6a1c210c2e..0258e36738 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -84,7 +84,12 @@ import org.apache.hadoop.hive.ql.plan.mapper.StatsSource;
 import org.apache.hadoop.hive.ql.stats.OperatorStats;
 import org.apache.hadoop.hive.ql.stats.StatsUtils;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMin;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
@@ -1479,6 +1484,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
       // if UDAFs are present, new columns needs to be added
       if (!aggDesc.isEmpty() && stats != null) {
         List<ColStatistics> aggColStats = Lists.newArrayList();
+        int idx = 0;
         for (ColumnInfo ci : rs.getSignature()) {
 
           // if the columns in row schema is not contained in column
@@ -1492,6 +1498,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
             cs.setCountDistint(stats.getNumRows());
             cs.setNumNulls(0);
             cs.setAvgColLen(StatsUtils.getAvgColLenOf(conf, ci.getObjectInspector(), colType));
+            computeAggregateColumnMinMax(cs, conf, aggDesc.get(idx++), colType, parentStats);
             aggColStats.add(cs);
           }
         }
@@ -1524,6 +1531,77 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
       return null;
     }
 
+    /**
+     * If possible, sets the min / max values for the column based on the aggregate function
+     * being calculated and its input.
+     */
+    private static void computeAggregateColumnMinMax(ColStatistics cs, HiveConf conf, AggregationDesc agg,
+        String aggType, Statistics parentStats) throws SemanticException {
+      if (agg.getParameters() != null && agg.getParameters().size() == 1) {
+        ColStatistics parentCS = StatsUtils.getColStatisticsFromExpression(
+            conf, parentStats, agg.getParameters().get(0));
+        if (parentCS != null && parentCS.getRange() != null &&
+            parentCS.getRange().minValue != null && parentCS.getRange().maxValue != null) {
+          long valuesCount = agg.getDistinct() ?
+              parentCS.getCountDistint() :
+              parentStats.getNumRows() - parentCS.getNumNulls();
+          Range range = parentCS.getRange();
+          // Get the aggregate function matching the name in the query.
+          GenericUDAFResolver udaf =
+              FunctionRegistry.getGenericUDAFResolver(agg.getGenericUDAFName());
+          if (udaf instanceof GenericUDAFCount) {
+            cs.setRange(new Range(0, valuesCount));
+          } else if (udaf instanceof GenericUDAFMax || udaf instanceof GenericUDAFMin) {
+            cs.setRange(new Range(range.minValue, range.maxValue));
+          } else if (udaf instanceof GenericUDAFSum) {
+            switch (aggType) {
+            case serdeConstants.TINYINT_TYPE_NAME:
+            case serdeConstants.SMALLINT_TYPE_NAME:
+            case serdeConstants.DATE_TYPE_NAME:
+            case serdeConstants.INT_TYPE_NAME:
+            case serdeConstants.BIGINT_TYPE_NAME:
+              long maxValueLong = range.maxValue.longValue();
+              long minValueLong = range.minValue.longValue();
+              // If min value is less than or equal to max value and non-negative (legal range)
+              if (minValueLong <= maxValueLong && minValueLong >= 0) {
+                // min = minValue, max = (minValue + maxValue) * 0.5 * valuesCount
+                cs.setRange(new Range(
+                    minValueLong,
+                    StatsUtils.safeMult(
+                        StatsUtils.safeMult(StatsUtils.safeAdd(minValueLong, maxValueLong), 0.5),
+                        valuesCount)));
+              }
+              break;
+            case serdeConstants.FLOAT_TYPE_NAME:
+            case serdeConstants.DOUBLE_TYPE_NAME:
+              double maxValueDouble = range.maxValue.doubleValue();
+              double minValueDouble = range.minValue.doubleValue();
+              // If min value is less than or equal to max value and non-negative (legal range)
+              if (minValueDouble <= maxValueDouble && minValueDouble >= 0) {
+                // min = minValue, max = (minValue + maxValue) * 0.5 * valuesCount
+                cs.setRange(new Range(
+                    minValueDouble,
+                    (minValueDouble + maxValueDouble) * 0.5 * valuesCount));
+              }
+              break;
+            default:
+              if (aggType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
+                BigDecimal maxValueBD = new BigDecimal(range.maxValue.toString());
+                BigDecimal minValueBD = new BigDecimal(range.minValue.toString());
+                // If min value is less than or equal to max value and non-negative (legal range)
+                if (minValueBD.compareTo(maxValueBD) <= 0 && minValueBD.compareTo(BigDecimal.ZERO) >= 0) {
+                  // min = minValue, max = (minValue + maxValue) * 0.5 * valuesCount
+                  cs.setRange(new Range(
+                      minValueBD,
+                      minValueBD.add(maxValueBD).multiply(new BigDecimal(0.5)).multiply(new BigDecimal(valuesCount))));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
     private long getParentNumRows(GroupByOperator op, List<ExprNodeDesc> gbyKeys, HiveConf conf) {
       if(gbyKeys == null || gbyKeys.isEmpty()) {
         return op.getParentOperators().get(0).getStatistics().getNumRows();
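[Reviewer note, not part of the patch] The SUM branch above estimates the output range as
min = minValue and max = (minValue + maxValue) * 0.5 * valuesCount, i.e. it assumes the
average input value is the midpoint of the column's range and scales it by the number of
aggregated values (non-null rows, or the distinct count for SUM(DISTINCT ...)). A minimal
standalone sketch of that arithmetic follows; the class name and the sample numbers are
hypothetical, chosen only for illustration:

    import java.math.BigDecimal;

    public class SumRangeEstimateDemo {
      // Upper bound for SUM over a non-negative column with range [min, max]:
      // (min + max) / 2 * valuesCount.
      static BigDecimal upperBound(BigDecimal min, BigDecimal max, long valuesCount) {
        return min.add(max)
            .multiply(new BigDecimal("0.5"))
            .multiply(BigDecimal.valueOf(valuesCount));
      }

      public static void main(String[] args) {
        // Hypothetical input: column range [1, 100], 40539971 non-null values.
        System.out.println(upperBound(BigDecimal.ONE, BigDecimal.valueOf(100), 40539971L));
        // Prints 2047268535.5 -> the estimated max for the SUM column.
      }
    }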
diff --git a/ql/src/test/results/clientpositive/groupby_grouping_window.q.out b/ql/src/test/results/clientpositive/groupby_grouping_window.q.out
index e6cc459596..7f687da45b 100644
--- a/ql/src/test/results/clientpositive/groupby_grouping_window.q.out
+++ b/ql/src/test/results/clientpositive/groupby_grouping_window.q.out
@@ -75,7 +75,7 @@ STAGE PLANS:
               pruneGroupingSetId: true
             Filter Operator
               predicate: (_col3 > 0) (type: boolean)
-              Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 3 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
               File Output Operator
                 compressed: false
                 table:
@@ -91,14 +91,14 @@ STAGE PLANS:
                 key expressions: _col0 (type: int), _col3 (type: int)
                 sort order: ++
                 Map-reduce partition columns: _col0 (type: int)
-                Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 3 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
                 value expressions: _col2 (type: int)
       Execution mode: vectorized
       Reduce Operator Tree:
         Select Operator
           expressions: KEY.reducesinkkey0 (type: int), VALUE._col1 (type: int), KEY.reducesinkkey1 (type: int)
           outputColumnNames: _col0, _col2, _col3
-          Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+          Statistics: Num rows: 3 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
           PTF Operator
             Function definitions:
                 Input definition
@@ -119,14 +119,14 @@ STAGE PLANS:
                       window function: GenericUDAFRankEvaluator
                       window frame: ROWS PRECEDING(MAX)~FOLLOWING(MAX)
                       isPivotResult: true
-            Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+            Statistics: Num rows: 3 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
             Select Operator
               expressions: _col0 (type: int), _col2 (type: int), _col3 (type: int), rank_window_0 (type: int)
               outputColumnNames: _col0, _col1, _col2, _col3
-              Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE
+              Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
              File Output Operator
                compressed: false
-                Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
                table:
                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/llap/subquery_scalar.q.out b/ql/src/test/results/clientpositive/llap/subquery_scalar.q.out
index 0d8ff14b46..5817f9811c 100644
--- a/ql/src/test/results/clientpositive/llap/subquery_scalar.q.out
+++ b/ql/src/test/results/clientpositive/llap/subquery_scalar.q.out
@@ -1500,7 +1500,7 @@ STAGE PLANS:
                         keys:
                           0 _col0 (type: string)
                           1 _col0 (type: string)
-                        Statistics: Num rows: 4 Data size: 32 Basic stats: COMPLETE Column stats: COMPLETE
+                        Statistics: Num rows: 13 Data size: 104 Basic stats: COMPLETE Column stats: COMPLETE
                         Group By Operator
                           aggregations: count()
                           minReductionHashAggr: 0.0
@@ -1537,16 +1537,16 @@ STAGE PLANS:
                 Statistics: Num rows: 13 Data size: 1625 Basic stats: COMPLETE Column stats: COMPLETE
                 Filter Operator
                   predicate: (_col1 > 100) (type: boolean)
-                  Statistics: Num rows: 4 Data size: 500 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 13 Data size: 1625 Basic stats: COMPLETE Column stats: COMPLETE
                   Select Operator
                     expressions: _col0 (type: string)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 4 Data size: 484 Basic stats: COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 13 Data size: 1573 Basic stats: COMPLETE Column stats: COMPLETE
                     Reduce Output Operator
                       key expressions: _col0 (type: string)
                       sort order: +
                       Map-reduce partition columns: _col0 (type: string)
-                      Statistics: Num rows: 4 Data size: 484 Basic stats: COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 13 Data size: 1573 Basic stats: COMPLETE Column stats: COMPLETE
 
   Stage: Stage-0
     Fetch Operator
diff --git a/ql/src/test/results/clientpositive/llap/subquery_select.q.out b/ql/src/test/results/clientpositive/llap/subquery_select.q.out
index fc70407240..d58905cea3 100644
--- a/ql/src/test/results/clientpositive/llap/subquery_select.q.out
+++ b/ql/src/test/results/clientpositive/llap/subquery_select.q.out
@@ -5057,16 +5057,16 @@ STAGE PLANS:
                 Statistics: Num rows: 13 Data size: 208 Basic stats: COMPLETE Column stats: COMPLETE
                 Filter Operator
                   predicate: (_col2 > 0L) (type: boolean)
-                  Statistics: Num rows: 4 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 13 Data size: 208 Basic stats: COMPLETE Column stats: COMPLETE
                   Select Operator
                     expressions: _col1 (type: int)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 13 Data size: 52 Basic stats: COMPLETE Column stats: COMPLETE
                     Reduce Output Operator
                       key expressions: _col0 (type: int)
                       sort order: +
                       Map-reduce partition columns: _col0 (type: int)
-                      Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 13 Data size: 52 Basic stats: COMPLETE Column stats: COMPLETE
         Reducer 2
             Execution mode: llap
             Reduce Operator Tree:
@@ -5534,16 +5534,16 @@ STAGE PLANS:
                 Statistics: Num rows: 13 Data size: 208 Basic stats: COMPLETE Column stats: COMPLETE
                 Filter Operator
                   predicate: (_col2 > 0L) (type: boolean)
-                  Statistics: Num rows: 4 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 13 Data size: 208 Basic stats: COMPLETE Column stats: COMPLETE
                   Select Operator
                     expressions: _col1 (type: int)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 13 Data size: 52 Basic stats: COMPLETE Column stats: COMPLETE
                     Reduce Output Operator
                       key expressions: _col0 (type: int)
                       sort order: +
                       Map-reduce partition columns: _col0 (type: int)
-                      Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 13 Data size: 52 Basic stats: COMPLETE Column stats: COMPLETE
         Reducer 6
             Execution mode: llap
            Reduce Operator Tree:
diff --git a/ql/src/test/results/clientpositive/llap/vector_groupby_grouping_window.q.out b/ql/src/test/results/clientpositive/llap/vector_groupby_grouping_window.q.out
index 3a9ea79349..5e391bf568 100644
--- a/ql/src/test/results/clientpositive/llap/vector_groupby_grouping_window.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_groupby_grouping_window.q.out
@@ -151,7 +151,7 @@ STAGE PLANS:
                         native: true
                         predicateExpression: FilterLongColGreaterLongScalar(col 2:int, val 0)
                     predicate: (_col3 > 0) (type: boolean)
-                    Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 3 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
                     Reduce Output Operator
                       key expressions: _col0 (type: int), _col3 (type: int)
                       sort order: ++
@@ -163,7 +163,7 @@ STAGE PLANS:
                           nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
                           partitionColumns: 0:int
                           valueColumns: 1:int
-                      Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 3 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
                       value expressions: _col2 (type: int)
         Reducer 3
             Execution mode: vectorized, llap
@@ -188,7 +188,7 @@ STAGE PLANS:
                     className: VectorSelectOperator
                     native: true
                     projectedOutputColumnNums: [0, 2, 1]
-                Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+                Statistics: Num rows: 3 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
                 PTF Operator
                   Function definitions:
                       Input definition
@@ -222,7 +222,7 @@ STAGE PLANS:
                       outputTypes: [int, int, int, int]
                       partitionExpressions: [col 0:int]
                       streamingColumns: [3]
-                  Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 3 Data size: 60 Basic stats: COMPLETE Column stats: COMPLETE
                   Select Operator
                     expressions: _col0 (type: int), _col2 (type: int), _col3 (type: int), rank_window_0 (type: int)
                     outputColumnNames: _col0, _col1, _col2, _col3
@@ -230,13 +230,13 @@ STAGE PLANS:
                       className: VectorSelectOperator
                       native: true
                       projectedOutputColumnNums: [0, 2, 1, 3]
-                    Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
                     File Output Operator
                       compressed: false
                       File Sink Vectorization:
                           className: VectorFileSinkOperator
                           native: false
-                      Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE
                       table:
                           input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                           output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/query78.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/query78.q.out
index 888d33557b..792540fe49 100644
--- a/ql/src/test/results/clientpositive/perf/tez/constraints/query78.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/constraints/query78.q.out
@@ -158,18 +158,18 @@ Stage-0
     File Output Operator [FS_269]
       Limit [LIM_268] (rows=100 width=484)
         Number of rows:100
-        Select Operator [SEL_267] (rows=203549242538 width=483)
+        Select Operator [SEL_267] (rows=1831943309558 width=483)
          Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"]
        <-Reducer 5 [SIMPLE_EDGE]
          SHUFFLE [RS_73]
-          Select Operator [SEL_72] (rows=203549242538 width=719)
+          Select Operator [SEL_72] (rows=1831943309558 width=719)
            Output:["_col0","_col1","_col6","_col7","_col8","_col9","_col10","_col11","_col12"]
-            Merge Join Operator [MERGEJOIN_220] (rows=203549242538 width=703)
+            Merge Join Operator [MERGEJOIN_220] (rows=1831943309558 width=703)
              Conds:RS_69._col0, _col1=RS_266._col0, _col1(Inner),Output:["_col0","_col1","_col2","_col3","_col4","_col6","_col7","_col8","_col11","_col12","_col13"]
            <-Reducer 12 [ONE_TO_ONE_EDGE] vectorized
              FORWARD [RS_266]
                PartitionCols:_col0, _col1
-                Filter Operator [FIL_265] (rows=13513323 width=239)
+                Filter Operator [FIL_265] (rows=40539971 width=239)
                  predicate:(_col2 > 0L)
                  Group By Operator [GBY_264] (rows=40539971 width=239)
                    Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["sum(VALUE._col0)","sum(VALUE._col1)","sum(VALUE._col2)"],keys:KEY._col0, KEY._col1
@@ -228,7 +228,7 @@ Stage-0
             <-Reducer 4 [SIMPLE_EDGE]
               SHUFFLE [RS_69]
                 PartitionCols:_col0, _col1
-                Merge Join Operator [MERGEJOIN_219] (rows=7613716536 width=471)
+                Merge Join Operator [MERGEJOIN_219] (rows=22841150061 width=471)
                   Conds:RS_244._col1=RS_256._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4","_col6","_col7","_col8"]
                 <-Reducer 3 [SIMPLE_EDGE] vectorized
                   SHUFFLE [RS_244]
@@ -287,9 +287,9 @@ Stage-0
                 <-Reducer 9 [SIMPLE_EDGE] vectorized
                   SHUFFLE [RS_256]
                     PartitionCols:_col0
-                    Select Operator [SEL_255] (rows=33694814 width=235)
+                    Select Operator [SEL_255] (rows=101084444 width=235)
                       Output:["_col0","_col1","_col2","_col3"]
-                      Filter Operator [FIL_254] (rows=33694814 width=239)
+                      Filter Operator [FIL_254] (rows=101084444 width=239)
                        predicate:(_col2 > 0L)
                        Select Operator [SEL_253] (rows=101084444 width=239)
                          Output:["_col1","_col2","_col3","_col4"]
diff --git a/ql/src/test/results/clientpositive/perf/tez/query78.q.out b/ql/src/test/results/clientpositive/perf/tez/query78.q.out
index e66d6f518a..9ce2cdba16 100644
--- a/ql/src/test/results/clientpositive/perf/tez/query78.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/query78.q.out
@@ -158,20 +158,20 @@ Stage-0
     File Output Operator [FS_276]
       Limit [LIM_275] (rows=100 width=484)
         Number of rows:100
-        Select Operator [SEL_274] (rows=203549242531 width=483)
+        Select Operator [SEL_274] (rows=1831943309424 width=483)
          Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"]
        <-Reducer 5 [SIMPLE_EDGE]
          SHUFFLE [RS_76]
-          Select Operator [SEL_75] (rows=203549242531 width=719)
+          Select Operator [SEL_75] (rows=1831943309424 width=719)
            Output:["_col0","_col1","_col2","_col6","_col7","_col8","_col9","_col10","_col11","_col12"]
-            Merge Join Operator [MERGEJOIN_223] (rows=203549242531 width=715)
+            Merge Join Operator [MERGEJOIN_223] (rows=1831943309424 width=715)
              Conds:RS_72._col1=RS_273._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4","_col7","_col8","_col9","_col11","_col12","_col13","_col14","_col15"]
            <-Reducer 12 [SIMPLE_EDGE] vectorized
              SHUFFLE [RS_273]
                PartitionCols:_col0
-                Select Operator [SEL_272] (rows=33694814 width=247)
+                Select Operator [SEL_272] (rows=101084444 width=247)
                  Output:["_col0","_col1","_col2","_col3","_col4","_col5"]
-                  Filter Operator [FIL_271] (rows=33694814 width=239)
+                  Filter Operator [FIL_271] (rows=101084444 width=239)
                    predicate:(_col2 > 0L)
                    Select Operator [SEL_270] (rows=101084444 width=239)
                      Output:["_col1","_col2","_col3","_col4"]
@@ -234,7 +234,7 @@ Stage-0
             <-Reducer 4 [SIMPLE_EDGE]
               SHUFFLE [RS_72]
                 PartitionCols:_col1
-                Merge Join Operator [MERGEJOIN_222] (rows=3053485049 width=471)
+                Merge Join Operator [MERGEJOIN_222] (rows=9160455599 width=471)
                   Conds:RS_248._col1, _col0=RS_260._col1, _col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4","_col7","_col8","_col9"]
                 <-Reducer 3 [ONE_TO_ONE_EDGE] vectorized
                   FORWARD [RS_248]
@@ -295,9 +295,9 @@ Stage-0
                 <-Reducer 9 [ONE_TO_ONE_EDGE] vectorized
                   FORWARD [RS_260]
                     PartitionCols:_col1, _col0
-                    Select Operator [SEL_259] (rows=13513323 width=239)
+                    Select Operator [SEL_259] (rows=40539971 width=239)
                      Output:["_col0","_col1","_col2","_col3","_col4"]
-                     Filter Operator [FIL_258] (rows=13513323 width=239)
+                     Filter Operator [FIL_258] (rows=40539971 width=239)
                        predicate:(_col2 > 0L)
                        Group By Operator [GBY_257] (rows=40539971 width=239)
                          Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["sum(VALUE._col0)","sum(VALUE._col1)","sum(VALUE._col2)"],keys:KEY._col0, KEY._col1
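[Reviewer note, not part of the patch] The q.out changes above follow from the new ranges.
Before the patch, the count()/sum() output columns carried no range, so a predicate such as
(_col2 > 0L) fell back to a default selectivity heuristic (roughly 1/3, e.g. 13 rows -> 4).
With a range like [0, valuesCount] attached, the whole range can satisfy "> 0", so the filter
now keeps the parent row count (13 -> 13, 1 -> 3), and the larger estimates propagate through
the joins in query78. A toy model of that effect, under the assumption that selectivity is
the fraction of the range passing the predicate (a simplification, not the exact
FilterStatsRule logic):

    public class FilterSelectivityDemo {
      // Estimate rows surviving "col > c" given an optional [min, max] range.
      static long estimateRows(long inputRows, Long min, Long max, long c) {
        if (min == null || max == null) {
          return Math.max(1, inputRows / 3);  // no range: default heuristic
        }
        if (c >= max) {
          return 0;                           // nothing can pass
        }
        if (c < min) {
          return inputRows;                   // everything passes
        }
        double fraction = (double) (max - c) / (max - min);
        return Math.max(1, Math.round(inputRows * fraction));
      }

      public static void main(String[] args) {
        System.out.println(estimateRows(13, null, null, 0)); // before the patch: 4
        System.out.println(estimateRows(13, 0L, 13L, 0));    // after the patch: 13
      }
    }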