diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index bc17fec..900115e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -665,7 +665,24 @@ else if (udaf instanceof GenericUDAFCount) { ois.add(TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(colInfo.getType())); } } else { - int aggrPos = 0; + // In the return path, the GBY may emit aggr($f0), aggr($f1) while the + // SEL selects them in a different order, e.g. aggr($f1), aggr($f0). + // Use the SEL column expression map (colExprMap) to work out which GBY + // output position each non-constant SEL output position corresponds to. + Map nameToIndex = new HashMap<>(); + for (int index = 0; index < cgbyOp.getConf().getOutputColumnNames().size(); index++) { + nameToIndex.put(cgbyOp.getConf().getOutputColumnNames().get(index), index); + } + List outputColumnNames = cselOp.getConf().getOutputColumnNames(); + Map cselOpTocgbyOp = new HashMap<>(); + for (int index = 0; index < outputColumnNames.size(); index++) { + if (!posToConstant.containsKey(index)) { + String outputColumnName = outputColumnNames.get(index); + ExprNodeColumnDesc exprColumnNodeDesc = (ExprNodeColumnDesc) cselOp + .getColumnExprMap().get(outputColumnName); + cselOpTocgbyOp.put(index, nameToIndex.get(exprColumnNodeDesc.getColumn())); + } + } List oneRowWithConstant = new ArrayList<>(); for (int pos = 0; pos < cselOp.getSchema().getSignature().size(); pos++) { if (posToConstant.containsKey(pos)) { @@ -673,7 +690,7 @@ else if (udaf instanceof GenericUDAFCount) { oneRowWithConstant.add(posToConstant.get(pos)); } else { // This position is an aggregation. 
- oneRowWithConstant.add(oneRow.get(aggrPos++)); + oneRowWithConstant.add(oneRow.get(cselOpTocgbyOp.get(pos))); } ColumnInfo colInfo = cselOp.getSchema().getSignature().get(pos); colNames.add(colInfo.getInternalName()); diff --git a/ql/src/test/queries/clientpositive/cbo_rp_udf_udaf_stats_opt.q b/ql/src/test/queries/clientpositive/cbo_rp_udf_udaf_stats_opt.q new file mode 100644 index 0000000..8d3aac8 --- /dev/null +++ b/ql/src/test/queries/clientpositive/cbo_rp_udf_udaf_stats_opt.q @@ -0,0 +1,22 @@ +set hive.mapred.mode=nonstrict; +set hive.cbo.enable=true; +set hive.cbo.returnpath.hiveop=true; +set hive.exec.check.crossproducts=false; +set hive.compute.query.using.stats=true + +set hive.stats.fetch.column.stats=true; +set hive.auto.convert.join=false; + +-- SORT_QUERY_RESULTS + +-- 8. Test UDF/UDAF +select count(*), count(c_int), sum(c_int), avg(c_int), max(c_int), min(c_int) from cbo_t1; +select count(*), count(c_int) as a, sum(c_int), avg(c_int), max(c_int), min(c_int), case c_int when 0 then 1 when 1 then 2 else 3 end, sum(case c_int when 0 then 1 when 1 then 2 else 3 end) from cbo_t1 group by c_int order by a; +select * from (select count(*) as a, count(distinct c_int) as b, sum(c_int) as c, avg(c_int) as d, max(c_int) as e, min(c_int) as f from cbo_t1) cbo_t1; +select * from (select count(*) as a, count(distinct c_int) as b, sum(c_int) as c, avg(c_int) as d, max(c_int) as e, min(c_int) as f, case c_int when 0 then 1 when 1 then 2 else 3 end as g, sum(case c_int when 0 then 1 when 1 then 2 else 3 end) as h from cbo_t1 group by c_int) cbo_t1 order by a; +select f,a,e,b from (select count(*) as a, count(c_int) as b, sum(c_int) as c, avg(c_int) as d, max(c_int) as e, min(c_int) as f from cbo_t1) cbo_t1; +select f,a,e,b from (select count(*) as a, count(distinct c_int) as b, sum(distinct c_int) as c, avg(distinct c_int) as d, max(distinct c_int) as e, min(distinct c_int) as f from cbo_t1) cbo_t1; +select key,count(c_int) as a, avg(c_float) from cbo_t1 group by 
key order by a; +select count(distinct c_int) as a, avg(c_float) from cbo_t1 group by c_float order by a; +select count(distinct c_int) as a, avg(c_float) from cbo_t1 group by c_int order by a; +select count(distinct c_int) as a, avg(c_float) from cbo_t1 group by c_float, c_int order by a; diff --git a/ql/src/test/results/clientpositive/cbo_rp_udf_udaf_stats_opt.q.out b/ql/src/test/results/clientpositive/cbo_rp_udf_udaf_stats_opt.q.out new file mode 100644 index 0000000..a1e7fd8 --- /dev/null +++ b/ql/src/test/results/clientpositive/cbo_rp_udf_udaf_stats_opt.q.out @@ -0,0 +1,126 @@ +Warning: Value had a \n character in it. +PREHOOK: query: -- SORT_QUERY_RESULTS + +-- 8. Test UDF/UDAF +select count(*), count(c_int), sum(c_int), avg(c_int), max(c_int), min(c_int) from cbo_t1 +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: -- SORT_QUERY_RESULTS + +-- 8. Test UDF/UDAF +select count(*), count(c_int), sum(c_int), avg(c_int), max(c_int), min(c_int) from cbo_t1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +20 18 18 1.0 1 1 +PREHOOK: query: select count(*), count(c_int) as a, sum(c_int), avg(c_int), max(c_int), min(c_int), case c_int when 0 then 1 when 1 then 2 else 3 end, sum(case c_int when 0 then 1 when 1 then 2 else 3 end) from cbo_t1 group by c_int order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select count(*), count(c_int) as a, sum(c_int), avg(c_int), max(c_int), min(c_int), case c_int when 0 then 1 when 1 then 2 else 3 end, sum(case c_int when 0 then 1 when 1 then 2 else 3 end) from cbo_t1 group by c_int order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +18 18 18 1.0 1 1 2 36 
+2 0 NULL NULL NULL NULL 3 6 +PREHOOK: query: select * from (select count(*) as a, count(distinct c_int) as b, sum(c_int) as c, avg(c_int) as d, max(c_int) as e, min(c_int) as f from cbo_t1) cbo_t1 +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select * from (select count(*) as a, count(distinct c_int) as b, sum(c_int) as c, avg(c_int) as d, max(c_int) as e, min(c_int) as f from cbo_t1) cbo_t1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +20 1 18 1.0 1 1 +PREHOOK: query: select * from (select count(*) as a, count(distinct c_int) as b, sum(c_int) as c, avg(c_int) as d, max(c_int) as e, min(c_int) as f, case c_int when 0 then 1 when 1 then 2 else 3 end as g, sum(case c_int when 0 then 1 when 1 then 2 else 3 end) as h from cbo_t1 group by c_int) cbo_t1 order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select * from (select count(*) as a, count(distinct c_int) as b, sum(c_int) as c, avg(c_int) as d, max(c_int) as e, min(c_int) as f, case c_int when 0 then 1 when 1 then 2 else 3 end as g, sum(case c_int when 0 then 1 when 1 then 2 else 3 end) as h from cbo_t1 group by c_int) cbo_t1 order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +18 1 18 1.0 1 1 2 36 +2 0 NULL NULL NULL NULL 3 6 +PREHOOK: query: select f,a,e,b from (select count(*) as a, count(c_int) as b, sum(c_int) as c, avg(c_int) as d, max(c_int) as e, min(c_int) as f from cbo_t1) cbo_t1 +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select f,a,e,b from (select count(*) as a, count(c_int) as b, sum(c_int) as c, avg(c_int) as 
d, max(c_int) as e, min(c_int) as f from cbo_t1) cbo_t1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +1 20 1 18 +PREHOOK: query: select f,a,e,b from (select count(*) as a, count(distinct c_int) as b, sum(distinct c_int) as c, avg(distinct c_int) as d, max(distinct c_int) as e, min(distinct c_int) as f from cbo_t1) cbo_t1 +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select f,a,e,b from (select count(*) as a, count(distinct c_int) as b, sum(distinct c_int) as c, avg(distinct c_int) as d, max(distinct c_int) as e, min(distinct c_int) as f from cbo_t1) cbo_t1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +1 20 1 1 +PREHOOK: query: select key,count(c_int) as a, avg(c_float) from cbo_t1 group by key order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select key,count(c_int) as a, avg(c_float) from cbo_t1 group by key order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### + 1 2 1.0 + 1 2 1.0 +1 12 1.0 +1 2 1.0 +NULL 0 NULL +PREHOOK: query: select count(distinct c_int) as a, avg(c_float) from cbo_t1 group by c_float order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct c_int) as a, avg(c_float) from cbo_t1 group by c_float order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +0 NULL +1 1.0 +PREHOOK: query: select count(distinct c_int) as a, avg(c_float) from cbo_t1 group by c_int order 
by a +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct c_int) as a, avg(c_float) from cbo_t1 group by c_int order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +0 NULL +1 1.0 +PREHOOK: query: select count(distinct c_int) as a, avg(c_float) from cbo_t1 group by c_float, c_int order by a +PREHOOK: type: QUERY +PREHOOK: Input: default@cbo_t1 +PREHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct c_int) as a, avg(c_float) from cbo_t1 group by c_float, c_int order by a +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cbo_t1 +POSTHOOK: Input: default@cbo_t1@dt=2014 +#### A masked pattern was here #### +0 NULL +1 1.0