diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index b9814f4..9620e62 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -401,7 +401,6 @@ private long evaluateColEqualsNullExpr(Statistics stats, ExprNodeDesc pred, long numRows = stats.getNumRows(); - // evaluate similar to "col = constant" expr if (pred instanceof ExprNodeGenericFuncDesc) { ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred; @@ -413,9 +412,7 @@ private long evaluateColEqualsNullExpr(Statistics stats, ExprNodeDesc pred, String tabAlias = colDesc.getTabAlias(); ColStatistics cs = stats.getColumnStatisticsForColumn(tabAlias, colName); if (cs != null) { - long dvs = cs.getCountDistint(); - numRows = dvs == 0 ? numRows / 2 : numRows / dvs; - return numRows; + return cs.getNumNulls(); } } } diff --git ql/src/test/results/clientpositive/annotate_stats_filter.q.out ql/src/test/results/clientpositive/annotate_stats_filter.q.out index 12f7604..7afed30 100644 --- ql/src/test/results/clientpositive/annotate_stats_filter.q.out +++ ql/src/test/results/clientpositive/annotate_stats_filter.q.out @@ -2051,17 +2051,17 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: ((year = 2001) and year is null) (type: boolean) - Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), null (type: void) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.TextInputFormat @@ -2325,17 +2325,17 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: (((year = 2001) and year is null) or (state = 'CA')) (type: boolean) - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.TextInputFormat @@ -2463,17 +2463,17 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: (((year = 2001) or year is null) and (state = 'CA')) (type: boolean) - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'CA' (type: string), locid (type: int), zip (type: bigint), year (type: int) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 2 Data size: 204 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 102 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.TextInputFormat diff --git ql/src/test/results/clientpositive/annotate_stats_join.q.out ql/src/test/results/clientpositive/annotate_stats_join.q.out index 43712ba..2a6348c 100644 --- ql/src/test/results/clientpositive/annotate_stats_join.q.out +++ ql/src/test/results/clientpositive/annotate_stats_join.q.out @@ -262,12 +262,12 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: deptid is not null (type: boolean) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: deptid (type: int) sort order: + Map-reduce partition columns: deptid (type: int) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE tag: 0 value expressions: lastname (type: string) auto parallelism: false @@ -378,17 +378,17 @@ STAGE PLANS: 0 {VALUE._col0} {KEY.reducesinkkey0} 1 {KEY.reducesinkkey0} {VALUE._col0} outputColumnNames: _col0, _col1, _col4, _col5 - Statistics: Num rows: 8 Data size: 1464 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 10 Data size: 1830 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: string), _col1 (type: int), _col4 (type: int), _col5 (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 8 Data size: 1464 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 10 Data size: 1830 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 8 Data size: 1464 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 10 Data size: 1830 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.TextInputFormat @@ -497,12 +497,12 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: deptid is not null (type: boolean) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: deptid (type: int) sort order: + Map-reduce partition columns: deptid (type: int) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE tag: 2 value expressions: lastname (type: string) auto parallelism: false @@ -513,12 +513,12 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: deptid is not null (type: boolean) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: deptid (type: int) sort order: + Map-reduce partition columns: deptid (type: int) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE tag: 0 value expressions: lastname (type: string) auto parallelism: false @@ -631,17 +631,17 @@ STAGE PLANS: 1 {KEY.reducesinkkey0} {VALUE._col0} 2 {VALUE._col0} {KEY.reducesinkkey0} outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9 - Statistics: Num rows: 32 Data size: 8768 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 50 Data size: 13700 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: string), _col1 (type: int), _col4 (type: int), _col5 (type: string), _col8 (type: string), _col9 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 - Statistics: Num rows: 32 Data size: 8768 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 50 Data size: 13700 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 32 Data size: 8768 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 50 Data size: 13700 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.TextInputFormat @@ -748,12 +748,12 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: deptid is not null (type: boolean) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: deptid (type: int) sort order: + Map-reduce partition columns: deptid (type: int) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE tag: 0 value expressions: lastname (type: string) auto parallelism: false @@ -764,12 +764,12 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: locid is not null (type: boolean) - Statistics: Num rows: 7 Data size: 702 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: locid (type: int) sort order: + Map-reduce partition columns: locid (type: int) - Statistics: Num rows: 7 Data size: 702 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE tag: 2 value expressions: state (type: string), zip (type: bigint), year (type: int) auto parallelism: false @@ -929,17 +929,17 @@ STAGE PLANS: 1 {KEY.reducesinkkey0} {VALUE._col0} 2 {VALUE._col0} {KEY.reducesinkkey0} {VALUE._col1} {VALUE._col2} outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9, _col10, _col11 - Statistics: Num rows: 56 Data size: 15724 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 80 Data size: 22468 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: string), _col1 (type: int), _col4 (type: int), _col5 (type: string), _col8 (type: string), _col9 (type: int), _col10 (type: bigint), _col11 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 - Statistics: Num rows: 56 Data size: 15724 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 80 Data size: 22468 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 56 Data size: 15724 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 80 Data size: 22468 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.TextInputFormat @@ -1343,12 +1343,12 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: (deptid is not null and lastname is not null) (type: boolean) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: deptid (type: int), lastname (type: string) sort order: ++ Map-reduce partition columns: deptid (type: int), lastname (type: string) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE tag: 0 auto parallelism: false Path -> Alias: @@ -1458,17 +1458,17 @@ STAGE PLANS: 0 {KEY.reducesinkkey1} {KEY.reducesinkkey0} 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} outputColumnNames: _col0, _col1, _col4, _col5 - Statistics: Num rows: 4 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE Select Operator expressions: _col0 (type: string), _col1 (type: int), _col4 (type: int), _col5 (type: string) outputColumnNames: _col0, _col1, _col2, _col3 - Statistics: Num rows: 4 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE File Output Operator compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 4 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.TextInputFormat @@ -1596,12 +1596,12 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: (deptid is not null and lastname is not null) (type: boolean) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: deptid (type: int), lastname (type: string) sort order: ++ Map-reduce partition columns: deptid (type: int), lastname (type: string) - Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5 Data size: 471 Basic stats: COMPLETE Column stats: COMPLETE tag: 0 auto parallelism: false TableScan @@ -1611,12 +1611,12 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: (locid is not null and state is not null) (type: boolean) - Statistics: Num rows: 6 Data size: 600 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: locid (type: int), state (type: string) sort order: ++ Map-reduce partition columns: locid (type: int), state (type: string) - Statistics: Num rows: 6 Data size: 600 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 804 Basic stats: COMPLETE Column stats: COMPLETE tag: 2 value expressions: zip (type: bigint), year (type: int) auto parallelism: false @@ -1776,17 +1776,17 @@ STAGE PLANS: 1 {KEY.reducesinkkey0} {KEY.reducesinkkey1} 2 {KEY.reducesinkkey1} {KEY.reducesinkkey0} {VALUE._col0} {VALUE._col1} outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9, _col10, _col11 - Statistics: Num rows: 24 Data size: 276 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 40 Data size: 468 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: string), _col1 (type: int), _col4 (type: int), _col5 (type: string), _col8 (type: string), _col9 (type: int), _col10 (type: bigint), _col11 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 - Statistics: Num rows: 24 Data size: 276 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 40 Data size: 468 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 24 Data size: 276 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 40 Data size: 468 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.TextInputFormat diff --git ql/src/test/results/clientpositive/metadataonly1.q.out ql/src/test/results/clientpositive/metadataonly1.q.out index a836e82..c1eeb71 100644 --- ql/src/test/results/clientpositive/metadataonly1.q.out +++ ql/src/test/results/clientpositive/metadataonly1.q.out @@ -711,11 +711,11 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: _col0 is not null (type: boolean) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: string) outputColumnNames: _col0 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false GlobalTableId: 0 @@ -743,7 +743,7 @@ STAGE PLANS: key expressions: _col0 (type: string) sort order: + Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE tag: 1 auto parallelism: false TableScan @@ -872,9 +872,9 @@ STAGE PLANS: condition expressions: 0 1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: NONE Select Operator - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: NONE Group By Operator aggregations: count() mode: hash diff --git ql/src/test/results/clientpositive/tez/metadataonly1.q.out ql/src/test/results/clientpositive/tez/metadataonly1.q.out index cce9774..26dfc34 100644 --- ql/src/test/results/clientpositive/tez/metadataonly1.q.out +++ ql/src/test/results/clientpositive/tez/metadataonly1.q.out @@ -839,16 +839,16 @@ STAGE PLANS: Filter Operator isSamplingPred: false predicate: _col0 is not null (type: boolean) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: string) outputColumnNames: _col0 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string) sort order: + Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: COMPLETE tag: 1 auto parallelism: true Reducer 3 @@ -860,9 +860,9 @@ STAGE PLANS: condition expressions: 0 1 - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: NONE Select Operator - Statistics: Num rows: 0 Data size: 0 Basic stats: NONE Column stats: NONE + Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: NONE Group By Operator aggregations: count() mode: hash