diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java
index 6e900b3bfe..8842aa64b0 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveReduceExpressionsWithStatsRule.java
@@ -41,7 +41,7 @@
 import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveIn;
 import org.apache.hadoop.hive.ql.plan.ColStatistics;
-import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -56,8 +56,11 @@
  * we can infer that the predicate will evaluate to false if the max
  * value for column a is 4.
  *
- * Currently we support the simplification of =, >=, <=, >, <, and
- * IN operations.
+ * Currently we support the simplification of:
+ *  - =, >=, <=, >, <
+ *  - IN
+ *  - ROW
+ *  - IS_NULL / IS_NOT_NULL
  */
 public class HiveReduceExpressionsWithStatsRule extends RelOptRule {
@@ -243,9 +246,28 @@ public RexNode visitCall(RexCall call) {
             }
             return rexBuilder.makeCall(HiveIn.INSTANCE, newOperands);
           }
-          // We cannot apply the reduction
           return call;
+        } else if (call.getOperator().getKind() == SqlKind.IS_NULL || call.getOperator().getKind() == SqlKind.IS_NOT_NULL) { // IS [NOT] NULL: try to fold the predicate to a boolean literal using null-count statistics
+          SqlKind kind = call.getOperator().getKind();
+
+          if (call.operands.get(0) instanceof RexInputRef) { // only a direct column reference is handled; anything else falls through unreduced
+            RexInputRef ref = (RexInputRef) call.operands.get(0);
+
+            ColStatistics stat = extractColStats(ref); // null when no up-to-date column statistics were found
+            Long rowCount = extractRowCount(ref); // null when no up-to-date row count was found
+            if (stat != null && rowCount != null) {
+              if (stat.getNumNulls() == 0 || stat.getNumNulls() == rowCount) { // decidable only when no row or every row is null. NOTE(review): these are origin-table statistics - confirm the column cannot pick up extra NULLs (e.g. from an outer join) between the table and this filter
+                boolean allNulls = (stat.getNumNulls() == rowCount); // also true when both counts are 0 (empty table)
+
+                if (kind == SqlKind.IS_NULL) {
+                  return rexBuilder.makeLiteral(allNulls); // IS NULL folds to true only in the all-null case
+                } else {
+                  return rexBuilder.makeLiteral(!allNulls); // IS NOT NULL is the exact negation
+                }
+              }
+            }
+          }
         }
         // If we did not reduce, check the children nodes
@@ -257,8 +279,18 @@ public RexNode visitCall(RexCall call) {
     }
     private Pair extractMaxMin(RexInputRef ref) {
+
+      ColStatistics cs = extractColStats(ref); // shared lookup; null when no up-to-date column statistics were found
       Number max = null;
       Number min = null;
+      if (cs != null && cs.getRange()!=null) { // max/min stay null when there are no stats or no range
+        max = cs.getRange().maxValue;
+        min = cs.getRange().minValue;
+      }
+      return Pair. of(max, min); // NOTE(review): the gap after "Pair." and the raw "Pair" return type suggest generic type arguments were stripped from this text - confirm against the original patch
+    }
+
+    private ColStatistics extractColStats(RexInputRef ref) { // returns the statistics of ref's origin column, or null (see the return statements below)
       RelColumnOrigin columnOrigin = this.metadataProvider.getColumnOrigin(filterOp, ref.getIndex());
       if (columnOrigin != null) {
         RelOptHiveTable table = (RelOptHiveTable) columnOrigin.getOriginTable();
@@ -267,15 +299,24 @@ public RexNode visitCall(RexCall call) {
             table.getColStat(Lists.newArrayList(columnOrigin.getOriginColumnOrdinal())).get(0);
         if (colStats != null && StatsSetupConst.areColumnStatsUptoDate(
             table.getHiveTableMD().getParameters(), colStats.getColumnName())) {
-          Range range = colStats.getRange();
-          if (range != null) {
-            max = range.maxValue;
-            min = range.minValue;
-          }
+          return colStats; // only statistics that pass the up-to-date check are handed out
+        }
+      }
+    }
+    return null; // no origin, no statistics, or statistics not up to date
+  }
+
+  private Long extractRowCount(RexInputRef ref) { // returns the row count of ref's origin table, or null when it cannot be obtained
+    RelColumnOrigin columnOrigin = this.metadataProvider.getColumnOrigin(filterOp, ref.getIndex());
+    if (columnOrigin != null) {
+      RelOptHiveTable table = (RelOptHiveTable) columnOrigin.getOriginTable();
+      if (table != null) {
+        if (StatsSetupConst.areBasicStatsUptoDate(table.getHiveTableMD().getParameters())) { // the row count is used only if the basic-stats check passes
+          return StatsUtils.getNumRows(table.getHiveTableMD());
         }
       }
     }
-    return Pair.of(max, min);
+    return null;
   }
   @SuppressWarnings("unchecked")
diff --git ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/rules/TestHiveReduceExpressionsWithStatsRule.java ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/rules/TestHiveReduceExpressionsWithStatsRule.java
new file mode 100644
index 0000000000..26cf3b7c20
--- /dev/null
+++ ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/rules/TestHiveReduceExpressionsWithStatsRule.java
@@ -0,0 +1,203 @@
+package org.apache.hadoop.hive.ql.optimizer.calcite.rules; // NOTE(review): this new file starts directly at the package line, with no license header - confirm whether the project's license check requires one
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.calcite.jdbc.JavaTypeFactoryImpl;
+import org.apache.calcite.plan.RelOptCluster;
+import org.apache.calcite.plan.RelOptSchema;
+import org.apache.calcite.plan.RelOptUtil;
+import org.apache.calcite.plan.hep.HepPlanner;
+import org.apache.calcite.plan.hep.HepProgramBuilder;
+import org.apache.calcite.rel.RelNode;
+import org.apache.calcite.rel.type.RelDataType;
+import org.apache.calcite.rex.RexBuilder;
+import org.apache.calcite.rex.RexLiteral;
+import org.apache.calcite.sql.SqlKind;
+import org.apache.calcite.sql.fun.SqlStdOperatorTable;
+import org.apache.calcite.tools.RelBuilder;
+import org.apache.hadoop.hive.common.StatsSetupConst;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
+import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.mockito.Matchers;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.runners.MockitoJUnitRunner;
+
+import com.google.common.collect.Lists;
+
+@RunWith(MockitoJUnitRunner.class)
+public class TestHiveReduceExpressionsWithStatsRule { // runs HiveReduceExpressionsWithStatsRule in a HepPlanner over a mocked table and checks whether the filter condition becomes a literal
+
+  @Mock
+  private RelOptSchema schemaMock;
+  @Mock
+  RelOptHiveTable tableMock;
+  @Mock
+  Table hiveTableMDMock;
+
+  Map tableParams = new HashMap<>(); // returned by hiveTableMDMock.getParameters(). NOTE(review): raw Map - type arguments were possibly lost when this text was extracted; confirm against the original patch
+
+  private HepPlanner planner;
+  private RelBuilder builder;
+  private ColStatistics statObj;
+
+  private static class MyRecord { // its public fields define the row type of the mocked table (columns _int and _str)
+    public int _int;
+    public String _str;
+  }
+
+  @Before
+  public void before() {
+    HepProgramBuilder programBuilder = new HepProgramBuilder();
+    programBuilder.addRuleInstance(HiveReduceExpressionsWithStatsRule.INSTANCE); // the rule under test is the only rule in the program
+
+    planner = new HepPlanner(programBuilder.build());
+
+    JavaTypeFactoryImpl typeFactory = new JavaTypeFactoryImpl();
+    RexBuilder rexBuilder = new RexBuilder(typeFactory);
+    final RelOptCluster optCluster = RelOptCluster.create(planner, rexBuilder);
+    RelDataType rowTypeMock = typeFactory.createStructType(MyRecord.class);
+    Mockito.doReturn(rowTypeMock).when(tableMock).getRowType();
+    Mockito.doReturn(tableMock).when(schemaMock).getTableForMember(Matchers.any()); // every table lookup resolves to the single mocked table
+    statObj = new ColStatistics("_int", "int");
+    Mockito.doReturn(Lists.newArrayList(statObj)).when(tableMock).getColStat(Matchers.anyListOf(Integer.class)); // the same statObj is returned whatever column is asked for, so the _str tests below also read it
+    Mockito.doReturn(hiveTableMDMock).when(tableMock).getHiveTableMD();
+    Mockito.doReturn(tableParams).when(hiveTableMDMock).getParameters();
+
+    builder = HiveRelFactories.HIVE_BUILDER.create(optCluster, schemaMock);
+
+    StatsSetupConst.setStatsStateForCreateTable(tableParams, Lists.newArrayList("_int"), "TRUE"); // "_int" matches the column name carried by statObj
+    tableParams.put(StatsSetupConst.ROW_COUNT, "3"); // presumably the value StatsUtils.getNumRows(...) hands to the rule - testIsNull_all relies on it equalling setNumNulls(3); verify
+
+  }
+
+  @Test
+  public void testGreaterThan_Below() {
+
+    // @formatter:off
+    final RelNode basePlan = builder
+          .scan("t")
+          .filter(
+              builder.call(SqlStdOperatorTable.GREATER_THAN,
+                    builder.field("_int"), builder.literal(0)
+                    )
+              )
+          .build();
+    // @formatter:on
+
+    statObj.setRange(100, 200); // the whole range lies above 0, so "_int > 0" is expected to fold to true
+    planner.setRoot(basePlan);
+    RelNode optimizedRelNode = planner.findBestExp();
+    assertEquals("missing literal", SqlKind.LITERAL, optimizedRelNode.getChildExps().get(0).getKind());
+    RexLiteral val = (RexLiteral) optimizedRelNode.getChildExps().get(0);
+    assertEquals(true, val.getValue());
+
+  }
+
+  @Test
+  public void testIsNull_zero() {
+
+    // @formatter:off
+    final RelNode basePlan = builder
+          .scan("t")
+          .filter(
+              builder.call(SqlStdOperatorTable.IS_NULL,
+                    builder.field("_str")
+                    )
+              )
+          .build();
+    // @formatter:on
+
+    statObj.setNumNulls(0); // no nulls: IS NULL is expected to fold to false
+    planner.setRoot(basePlan);
+    System.out.println(RelOptUtil.toString(basePlan)); // NOTE(review): plan dumps to stdout look like leftover debugging - consider removing (here and in the tests below)
+    RelNode optimizedRelNode = planner.findBestExp();
+    System.out.println(RelOptUtil.toString(optimizedRelNode));
+    assertEquals("missing literal", SqlKind.LITERAL, optimizedRelNode.getChildExps().get(0).getKind());
+    RexLiteral val = (RexLiteral) optimizedRelNode.getChildExps().get(0);
+    assertEquals(false, val.getValue());
+
+  }
+
+  @Test
+  public void testIsNull_one() {
+
+    // @formatter:off
+    final RelNode basePlan = builder
+          .scan("t")
+          .filter(
+              builder.call(SqlStdOperatorTable.IS_NULL,
+                    builder.field("_str")
+                    )
+              )
+          .build();
+    // @formatter:on
+
+    statObj.setNumNulls(1); // neither 0 nor the ROW_COUNT of 3: the predicate must be left unreduced
+    planner.setRoot(basePlan);
+    System.out.println(RelOptUtil.toString(basePlan));
+    RelNode optimizedRelNode = planner.findBestExp();
+    System.out.println(RelOptUtil.toString(optimizedRelNode));
+    assertNotEquals("should not be a literal", SqlKind.LITERAL, optimizedRelNode.getChildExps().get(0).getKind());
+  }
+
+  @Test
+  public void testIsNull_all() {
+
+    // @formatter:off
+    final RelNode basePlan = builder
+          .scan("t")
+          .filter(
+              builder.call(SqlStdOperatorTable.IS_NULL,
+                    builder.field("_str")
+                    )
+              )
+          .build();
+    // @formatter:on
+
+    statObj.setNumNulls(3); // equals the ROW_COUNT set in before(): IS NULL is expected to fold to true
+    planner.setRoot(basePlan);
+    System.out.println(RelOptUtil.toString(basePlan));
+    RelNode optimizedRelNode = planner.findBestExp();
+    System.out.println(RelOptUtil.toString(optimizedRelNode));
+    assertEquals("missing literal", SqlKind.LITERAL, optimizedRelNode.getChildExps().get(0).getKind());
+    RexLiteral val = (RexLiteral) optimizedRelNode.getChildExps().get(0);
+    assertEquals(true, val.getValue());
+
+  }
+
+  @Test
+  public void testIsNotNull() {
+
+    // @formatter:off
+    final RelNode basePlan = builder
+          .scan("t")
+          .filter(
+              builder.call(SqlStdOperatorTable.IS_NOT_NULL,
+                    builder.field("_str")
+                    )
+              )
+          .build();
+    // @formatter:on
+
+    statObj.setNumNulls(0); // no nulls: IS NOT NULL is expected to fold to true
+    planner.setRoot(basePlan);
+    System.out.println(RelOptUtil.toString(basePlan));
+    RelNode optimizedRelNode = planner.findBestExp();
+    System.out.println(RelOptUtil.toString(optimizedRelNode));
+    assertEquals("missing literal", SqlKind.LITERAL, optimizedRelNode.getChildExps().get(0).getKind());
+    RexLiteral val = (RexLiteral) optimizedRelNode.getChildExps().get(0);
+    assertEquals(true, val.getValue());
+
+  }
+
+}
diff --git ql/src/test/queries/clientpositive/remove_exprs_stats.q ql/src/test/queries/clientpositive/remove_exprs_stats.q
index 66e6615047..741d455c33 100644 --- ql/src/test/queries/clientpositive/remove_exprs_stats.q +++ ql/src/test/queries/clientpositive/remove_exprs_stats.q @@ -53,3 +53,23 @@ explain select * from loc_orc where locid IN (5,2,3); explain select * from loc_orc where locid IN (1,6,9); -- always false explain select * from loc_orc where locid IN (40,30); + + + +create table t ( s string); +insert into t values (null),(null); +analyze table t compute statistics for columns s; + +-- true +explain select * from t where s is null; +explain select * from loc_orc where locid is not null; +-- false +explain select * from t where s is not null; +explain select * from loc_orc where locid is null; + +insert into t values ('val1'); +analyze table t compute statistics for columns s; + +-- untouched +explain select * from t where s is not null; +explain select * from t where s is null; diff --git ql/src/test/results/clientpositive/remove_exprs_stats.q.out ql/src/test/results/clientpositive/remove_exprs_stats.q.out index 4600e71f09..c256a5cab5 100644 --- ql/src/test/results/clientpositive/remove_exprs_stats.q.out +++ ql/src/test/results/clientpositive/remove_exprs_stats.q.out @@ -576,3 +576,227 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: create table t ( s string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t +POSTHOOK: query: create table t ( s string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t +PREHOOK: query: insert into t values (null),(null) +PREHOOK: type: QUERY +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values (null),(null) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@t +POSTHOOK: Lineage: t.s SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: analyze table t compute statistics for columns s +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was 
here #### +POSTHOOK: query: analyze table t compute statistics for columns s +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +PREHOOK: query: explain select * from t where s is null +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t where s is null +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: t + Statistics: Num rows: 2 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: s (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: explain select * from loc_orc where locid is not null +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from loc_orc where locid is not null +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE + ListSink + +PREHOOK: query: explain select * from t where s is not null +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t where s is not null +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 2 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: false (type: boolean) + Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE + Select 
Operator + expressions: s (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select * from loc_orc where locid is null +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from loc_orc where locid is null +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 816 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: false (type: boolean) + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: insert into t values ('val1') +PREHOOK: type: QUERY +PREHOOK: Output: default@t +POSTHOOK: query: insert into t values ('val1') +POSTHOOK: type: QUERY +POSTHOOK: 
Output: default@t +POSTHOOK: Lineage: t.s SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: analyze table t compute statistics for columns s +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: analyze table t compute statistics for columns s +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +PREHOOK: query: explain select * from t where s is not null +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t where s is not null +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 3 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: s is not null (type: boolean) + Statistics: Num rows: 1 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: s (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain select * from t where s is null +PREHOOK: type: QUERY +POSTHOOK: query: explain select * from t where s is null +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 3 Data size: 170 Basic 
stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: s is null (type: boolean) + Statistics: Num rows: 2 Data size: 170 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: null (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +