diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java index 59c87a3..733620b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java @@ -312,17 +312,15 @@ protected boolean merge(ReduceSinkOperator cRS, ReduceSinkOperator pRS, int minR if (result[4] > 0) { // This case happens only when pRS key is empty in which case we can use // number of distribution keys and key serialization info from cRS - pRS.getConf().setNumDistributionKeys(cRS.getConf().getNumDistributionKeys()); - List fields = PlanUtils.getFieldSchemasFromColumnList(pRS.getConf() - .getKeyCols(), "reducesinkkey"); - TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(fields, pRS.getConf().getOrder(), - pRS.getConf().getNullOrder()); - ArrayList outputKeyCols = Lists.newArrayList(); - for (int i = 0; i < fields.size(); i++) { - outputKeyCols.add(fields.get(i).getName()); + if (pRS.getConf().getKeyCols() != null && pRS.getConf().getKeyCols().size() == 0 + && cRS.getConf().getKeyCols() != null && cRS.getConf().getKeyCols().size() == 0) { + // As setNumDistributionKeys is a subset of keycols, the size should + // be 0 too. This condition maybe too strict. We may extend it in the + // future. + TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(new ArrayList(), pRS + .getConf().getOrder(), pRS.getConf().getNullOrder()); + pRS.getConf().setKeySerializeInfo(keyTable); } - pRS.getConf().setOutputKeyColumnNames(outputKeyCols); - pRS.getConf().setKeySerializeInfo(keyTable); } return true; } diff --git a/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q b/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q new file mode 100644 index 0000000..8bbae39 --- /dev/null +++ b/ql/src/test/queries/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q @@ -0,0 +1,60 @@ +set hive.mapred.mode=nonstrict; +set hive.cbo.enable=false; + +set hive.map.aggr=false; + +set hive.groupby.skewindata=false; +set mapred.reduce.tasks=31; + + +select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq; + +explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq; + +select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq; + +set hive.optimize.reducededuplication=false; + +explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq; + +select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq; diff --git a/ql/src/test/results/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q.out b/ql/src/test/results/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q.out new file mode 100644 index 0000000..4a848f2 --- /dev/null +++ b/ql/src/test/results/clientpositive/reduceSinkDeDuplication_pRS_key_empty.q.out @@ -0,0 +1,220 @@ +PREHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +{"columntype":"Double","min":260.182,"max":260.182,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{0}{0}{0}{1}{1}{1}{0}{0}{0}{0}{0}{1}{2}{1}{0}"} {"columntype":"String","maxlength":2,"avglength":2.0,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{2}{0}{3}{6}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}"} {"columntype":"Double","min":20428.07287599998,"max":20428.07287599998,"countnulls":0,"numdistinctvalues":2,"ndvbitvector":"{0}{0}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}{4}{2}{0}"} {"columntype":"Double","min":20469.01089779557,"max":20469.01089779557,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{0}{1}{3}{2}{3}{5}{2}{0}{1}{0}{1}{1}{1}{1}{0}{1}"} +PREHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq +PREHOOK: type: QUERY +POSTHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: value (type: string) + outputColumnNames: value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: substr(value, 5) (type: string) + sort order: + + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + aggregations: avg(DISTINCT KEY._col0:0._col0), max(KEY._col0:0._col0), variance(KEY._col0:0._col0), var_samp(KEY._col0:0._col0) + mode: complete + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: compute_stats(_col0, 16), compute_stats(_col1, 16), compute_stats(_col2, 16), compute_stats(_col3, 16) + mode: complete + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +{"columntype":"Double","min":256.10355987055016,"max":256.10355987055016,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{2}{1}{0}{2}{0}{1}{1}{1}{0}{0}{1}{1}{0}{2}{1}{0}"} {"columntype":"String","maxlength":2,"avglength":2.0,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{2}{0}{3}{6}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}"} {"columntype":"Double","min":20428.07287599999,"max":20428.07287599999,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{4}{0}{0}{4}{3}{0}{1}{0}{0}{0}{0}{0}{0}{1}{2}"} {"columntype":"Double","min":20469.010897795582,"max":20469.010897795582,"countnulls":0,"numdistinctvalues":2,"ndvbitvector":"{2}{0}{2}{2}{0}{0}{2}{0}{0}{0}{0}{0}{1}{0}{0}{0}"} +PREHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq +PREHOOK: type: QUERY +POSTHOOK: query: explain select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: value (type: string) + outputColumnNames: value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: substr(value, 5) (type: string) + sort order: + + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + aggregations: avg(DISTINCT KEY._col0:0._col0), max(KEY._col0:0._col0), variance(KEY._col0:0._col0), var_samp(KEY._col0:0._col0) + mode: complete + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: double), _col1 (type: string), _col2 (type: double), _col3 (type: double) + Reduce Operator Tree: + Group By Operator + aggregations: compute_stats(VALUE._col0, 16), compute_stats(VALUE._col2, 16), compute_stats(VALUE._col3, 16), compute_stats(VALUE._col4, 16) + mode: complete + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 108 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select compute_stats(a,16),compute_stats(b,16),compute_stats(c,16),compute_stats(d,16) +from +( +select + avg(DISTINCT substr(src.value,5)) as a, + max(substr(src.value,5)) as b, + variance(substr(src.value,5)) as c, + var_samp(substr(src.value,5)) as d + from src)subq +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +{"columntype":"Double","min":256.10355987055016,"max":256.10355987055016,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{2}{1}{0}{2}{0}{1}{1}{1}{0}{0}{1}{1}{0}{2}{1}{0}"} {"columntype":"String","maxlength":2,"avglength":2.0,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{2}{0}{3}{6}{3}{0}{1}{1}{0}{0}{0}{0}{0}{0}{0}"} {"columntype":"Double","min":20428.07287599999,"max":20428.07287599999,"countnulls":0,"numdistinctvalues":1,"ndvbitvector":"{1}{4}{0}{0}{4}{3}{0}{1}{0}{0}{0}{0}{0}{0}{1}{2}"} {"columntype":"Double","min":20469.010897795582,"max":20469.010897795582,"countnulls":0,"numdistinctvalues":2,"ndvbitvector":"{2}{0}{2}{2}{0}{0}{2}{0}{0}{0}{0}{0}{1}{0}{0}{0}"}