diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index 17510e9..32d1de1 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -178,6 +178,10 @@ public MetaDataProcessor (ParseContext pctx) { abstract Object cast(double doubleValue); } + enum GbyKeyType { + NULL, CONSTANT, OTHER + } + private StatType getType(String origType) { if (serdeConstants.IntegralTypes.contains(origType)) { return StatType.Integeral; @@ -212,23 +216,23 @@ private Long getNullcountFor(StatType type, ColumnStatisticsData statData) { } } - private boolean hasNullOrConstantGbyKey(GroupByOperator gbyOp) { + private GbyKeyType getGbyKeyType(GroupByOperator gbyOp) { GroupByDesc gbyDesc = gbyOp.getConf(); int numCols = gbyDesc.getOutputColumnNames().size(); int aggCols = gbyDesc.getAggregators().size(); // If the Group by operator has null key if (numCols == aggCols) { - return true; + return GbyKeyType.NULL; } // If the Gby key is a constant List dpCols = gbyOp.getSchema().getColumnNames().subList(0, numCols - aggCols); for(String dpCol : dpCols) { ExprNodeDesc end = ExprNodeDescUtils.findConstantExprOrigin(dpCol, gbyOp); if (!(end instanceof ExprNodeConstantDesc)) { - return false; + return GbyKeyType.OTHER; } } - return true; + return GbyKeyType.CONSTANT; } @Override @@ -267,6 +271,16 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // limit. In order to be safe, we do not use it now. return null; } + Table tbl = tsOp.getConf().getTableMetadata(); + if (AcidUtils.isAcidTable(tbl)) { + Logger.info("Table " + tbl.getTableName() + " is ACID table. Skip StatsOptimizer."); + return null; + } + Long rowCnt = getRowCnt(pctx, tsOp, tbl); + // if we can not have correct table stats, then both the table stats and column stats are not useful. + if (rowCnt == null) { + return null; + } SelectOperator pselOp = (SelectOperator)stack.get(1); for(ExprNodeDesc desc : pselOp.getConf().getColList()) { if (!((desc instanceof ExprNodeColumnDesc) || (desc instanceof ExprNodeConstantDesc))) { @@ -278,7 +292,12 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, // Since we have done an exact match on TS-SEL-GBY-RS-GBY-(SEL)-FS // we need not to do any instanceof checks for following. GroupByOperator pgbyOp = (GroupByOperator)stack.get(2); - if (!hasNullOrConstantGbyKey(pgbyOp)) { + if (getGbyKeyType(pgbyOp) == GbyKeyType.OTHER) { + return null; + } + // we already check if rowCnt is null and rowCnt==0 means table is + // empty. + else if (getGbyKeyType(pgbyOp) == GbyKeyType.CONSTANT && rowCnt == 0) { return null; } ReduceSinkOperator rsOp = (ReduceSinkOperator)stack.get(3); @@ -288,7 +307,12 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, } GroupByOperator cgbyOp = (GroupByOperator)stack.get(4); - if (!hasNullOrConstantGbyKey(cgbyOp)) { + if (getGbyKeyType(cgbyOp) == GbyKeyType.OTHER) { + return null; + } + // we already check if rowCnt is null and rowCnt==0 means table is + // empty. + else if (getGbyKeyType(cgbyOp) == GbyKeyType.CONSTANT && rowCnt == 0) { return null; } Operator last = (Operator) stack.get(5); @@ -318,11 +342,6 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, return null; // todo we can collapse this part of tree into single TS } - Table tbl = tsOp.getConf().getTableMetadata(); - if (AcidUtils.isAcidTable(tbl)) { - Logger.info("Table " + tbl.getTableName() + " is ACID table. Skip StatsOptimizer."); - return null; - } List oneRow = new ArrayList(); Hive hive = Hive.get(pctx.getConf()); @@ -350,10 +369,6 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, } else { return null; } - Long rowCnt = getRowCnt(pctx, tsOp, tbl); - if(rowCnt == null) { - return null; - } switch (category) { case LONG: oneRow.add(Long.valueOf(constant) * rowCnt); @@ -370,7 +385,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, } else if (udaf instanceof GenericUDAFCount) { // always long - Long rowCnt = 0L; + rowCnt = 0L; if (aggr.getParameters().isEmpty()) { // Its either count (*) or count() case rowCnt = getRowCnt(pctx, tsOp, tbl); diff --git a/ql/src/test/queries/clientpositive/constGby.q b/ql/src/test/queries/clientpositive/constGby.q new file mode 100644 index 0000000..bde2e79 --- /dev/null +++ b/ql/src/test/queries/clientpositive/constGby.q @@ -0,0 +1,21 @@ +set hive.mapred.mode=nonstrict; + +create table t1 (a int); +analyze table t1 compute statistics; +analyze table t1 compute statistics for columns; + +explain select count(1) from t1 group by 1; +select count(1) from t1 group by 1; +select count(1) from t1; +explain select count(*) from t1; +select count(*) from t1; +select count(1) from t1 group by 1=1; +select count(1), max(a) from t1 group by 1=1; + +set hive.compute.query.using.stats=false; + +select count(1) from t1 group by 1; +select count(1) from t1; +select count(*) from t1; +select count(1) from t1 group by 1=1; +select count(1), max(a) from t1 group by 1=1; diff --git a/ql/src/test/results/clientpositive/constGby.q.out b/ql/src/test/results/clientpositive/constGby.q.out new file mode 100644 index 0000000..fd8ecc2 --- /dev/null +++ b/ql/src/test/results/clientpositive/constGby.q.out @@ -0,0 +1,176 @@ +PREHOOK: query: create table t1 (a int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t1 +POSTHOOK: query: create table t1 (a int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t1 +PREHOOK: query: analyze table t1 compute statistics +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@t1 +POSTHOOK: query: analyze table t1 compute statistics +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@t1 +PREHOOK: query: analyze table t1 compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: analyze table t1 compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +PREHOOK: query: explain select count(1) from t1 group by 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1) from t1 group by 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: t1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Select Operator + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Group By Operator + aggregations: count(1) + keys: 1 (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: bigint) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1) from t1 group by 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from t1 group by 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +PREHOOK: query: select count(1) from t1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from t1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +0 +PREHOOK: query: explain select count(*) from t1 +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(*) from t1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from t1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from t1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +0 +PREHOOK: query: select count(1) from t1 group by 1=1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from t1 group by 1=1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +PREHOOK: query: select count(1), max(a) from t1 group by 1=1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select count(1), max(a) from t1 group by 1=1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +PREHOOK: query: select count(1) from t1 group by 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from t1 group by 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +PREHOOK: query: select count(1) from t1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from t1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +0 +PREHOOK: query: select count(*) from t1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from t1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +0 +PREHOOK: query: select count(1) from t1 group by 1=1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select count(1) from t1 group by 1=1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +PREHOOK: query: select count(1), max(a) from t1 group by 1=1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select count(1), max(a) from t1 group by 1=1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +#### A masked pattern was here ####