diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index c4fc5ca..320dc10 100644
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -24,6 +24,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.Set;
 import java.util.Stack;

 import org.apache.hadoop.hive.conf.HiveConf;
@@ -53,6 +54,7 @@
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicListDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
@@ -64,6 +66,7 @@
 import org.apache.hadoop.hive.ql.stats.StatsUtils;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualNS;
@@ -76,19 +79,24 @@
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull;
 import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;

 public class StatsRulesProcFactory {
   private static final Logger LOG = LoggerFactory.getLogger(StatsRulesProcFactory.class.getName());
   private static final boolean isDebugEnabled = LOG.isDebugEnabled();

+
   /**
    * Collect basic statistics like number of rows, data size and column level statistics from the
    * table. Also sets the state of the available statistics. Basic and column statistics can have
@@ -299,7 +307,7 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,

     private long evaluateExpression(Statistics stats, ExprNodeDesc pred,
         AnnotateStatsProcCtx aspCtx, List<String> neededCols,
-        FilterOperator fop, long evaluatedRowCount) throws CloneNotSupportedException {
+        FilterOperator fop, long evaluatedRowCount) throws CloneNotSupportedException, SemanticException {

       long newNumRows = 0;
       Statistics andStats = null;
@@ -338,6 +346,9 @@ private long evaluateExpression(Statistics stats, ExprNodeDesc pred,
             evaluatedRowCount = newNumRows;
           }
         }
+      } else if (udf instanceof GenericUDFIn) {
+        // for IN clause
+        newNumRows = evaluateInExpr(stats, pred, aspCtx, neededCols, fop);
       } else if (udf instanceof GenericUDFOPNot) {
         newNumRows = evaluateNotExpr(stats, pred, aspCtx, neededCols, fop);
       } else if (udf instanceof GenericUDFOPNotNull) {
@@ -375,9 +386,97 @@ private long evaluateExpression(Statistics stats, ExprNodeDesc pred,
       return newNumRows;
     }

+    private long evaluateInExpr(Statistics stats, ExprNodeDesc pred, AnnotateStatsProcCtx aspCtx,
+        List<String> neededCols, FilterOperator fop) throws SemanticException {
+
+      long numRows = stats.getNumRows();
+
+      ExprNodeGenericFuncDesc fd = (ExprNodeGenericFuncDesc) pred;
+
+      // 1. It is an IN operator; check whether it uses STRUCT
+      List<ExprNodeDesc> children = fd.getChildren();
+      List<ExprNodeDesc> columns = Lists.newArrayList();
+      List<ColStatistics> columnStats = Lists.newArrayList();
+      List<Set<ExprNodeDescEqualityWrapper>> values = Lists.newArrayList();
+      ExprNodeDesc columnsChild = children.get(0);
+      boolean multiColumn;
+      if (columnsChild instanceof ExprNodeGenericFuncDesc &&
+          ((ExprNodeGenericFuncDesc) columnsChild).getGenericUDF() instanceof GenericUDFStruct) {
+        for (int j = 0; j < columnsChild.getChildren().size(); j++) {
+          ExprNodeDesc columnChild = columnsChild.getChildren().get(j);
+          // If the column is not a column reference, we bail out
+          if (!(columnChild instanceof ExprNodeColumnDesc)) {
+            // Default
+            return numRows / 2;
+          }
+          columns.add(columnChild);
+          final String columnName = ((ExprNodeColumnDesc) columnChild).getColumn();
+          // If the column name is not contained in the needed column list, it is
+          // a partition column. We do not need to evaluate partition columns in
+          // the filter expression since they are taken care of by the partition pruner
+          if (neededCols != null && !neededCols.contains(columnName)) {
+            // Default
+            return numRows / 2;
+          }
+          columnStats.add(stats.getColumnStatisticsFromColName(columnName));
+          values.add(Sets.<ExprNodeDescEqualityWrapper>newHashSet());
+        }
+        multiColumn = true;
+      } else {
+        // If the column is not a column reference, we bail out
+        if (!(columnsChild instanceof ExprNodeColumnDesc)) {
+          // Default
+          return numRows / 2;
+        }
+        columns.add(columnsChild);
+        final String columnName = ((ExprNodeColumnDesc) columnsChild).getColumn();
+        // If the column name is not contained in the needed column list, it is
+        // a partition column. We do not need to evaluate partition columns in
+        // the filter expression since they are taken care of by the partition pruner
+        if (neededCols != null && !neededCols.contains(columnName)) {
+          // Default
+          return numRows / 2;
+        }
+        columnStats.add(stats.getColumnStatisticsFromColName(columnName));
+        values.add(Sets.<ExprNodeDescEqualityWrapper>newHashSet());
+        multiColumn = false;
+      }
+
+      // 2. Extract columns and values
+      for (int i = 1; i < children.size(); i++) {
+        ExprNodeDesc child = children.get(i);
+        // If the value is not a constant, we bail out
+        if (!(child instanceof ExprNodeConstantDesc)) {
+          // Default
+          return numRows / 2;
+        }
+        if (multiColumn) {
+          ExprNodeConstantDesc constantChild = (ExprNodeConstantDesc) child;
+          List<?> items = (List<?>) constantChild.getWritableObjectInspector().getWritableConstantValue();
+          List<TypeInfo> structTypes = ((StructTypeInfo) constantChild.getTypeInfo()).getAllStructFieldTypeInfos();
+          for (int j = 0; j < structTypes.size(); j++) {
+            ExprNodeConstantDesc constant = new ExprNodeConstantDesc(structTypes.get(j), items.get(j));
+            values.get(j).add(new ExprNodeDescEqualityWrapper(constant));
+          }
+        } else {
+          values.get(0).add(new ExprNodeDescEqualityWrapper(child));
+        }
+      }
+
+      // 3. Calculate IN selectivity
+      float factor = 1;
+      for (int i = 0; i < columnStats.size(); i++) {
+        long dvs = columnStats.get(i) == null ? 0 : columnStats.get(i).getCountDistint();
+        // (num of distinct vals for col in IN clause / num of distinct vals for col)
+        float columnFactor = dvs == 0 ? 0.5f : ((float) values.get(i).size() / dvs);
+        factor *= columnFactor;
+      }
+      return Math.round((double) numRows * factor);
+    }
+
     private long evaluateNotExpr(Statistics stats, ExprNodeDesc pred,
         AnnotateStatsProcCtx aspCtx, List<String> neededCols, FilterOperator fop)
-        throws CloneNotSupportedException {
+        throws CloneNotSupportedException, SemanticException {

       long numRows = stats.getNumRows();

@@ -676,7 +775,7 @@ private long evaluateComparator(Statistics stats, ExprNodeGenericFuncDesc genFun

     private long evaluateChildExpr(Statistics stats, ExprNodeDesc child,
         AnnotateStatsProcCtx aspCtx, List<String> neededCols,
-        FilterOperator fop, long evaluatedRowCount) throws CloneNotSupportedException {
+        FilterOperator fop, long evaluatedRowCount) throws CloneNotSupportedException, SemanticException {

       long numRows = stats.getNumRows();

@@ -761,7 +860,7 @@ private long evaluateChildExpr(Statistics stats, ExprNodeDesc child,
       } else if (udf instanceof GenericUDFOPNull) {
         return evaluateColEqualsNullExpr(stats, genFunc);
       } else if (udf instanceof GenericUDFOPAnd || udf instanceof GenericUDFOPOr
-          || udf instanceof GenericUDFOPNot) {
+          || udf instanceof GenericUDFIn || udf instanceof GenericUDFOPNot) {
         return evaluateExpression(stats, genFunc, aspCtx, neededCols, fop, evaluatedRowCount);
       }
     }
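Note on the estimate above: when column statistics are missing (Column stats: NONE, so dvs == 0), every column referenced by the IN contributes a fixed 0.5 selectivity; when statistics are available, a column contributes the fraction of its distinct values that appear in the IN list, with the Set<ExprNodeDescEqualityWrapper> deduplicating repeated constants so each value is counted once. For a multi-column STRUCT IN the per-column factors are multiplied, which implicitly assumes the columns are independent. A minimal standalone sketch of that arithmetic, using hypothetical names that are not part of the patch:

  import java.util.Arrays;
  import java.util.List;

  public final class InSelectivitySketch {

    // Per-column factor: 0.5 when the NDV is unknown (dvs == 0), otherwise the
    // fraction of the column's distinct values that appear in the IN list.
    static float columnFactor(long dvs, int distinctValuesInList) {
      return dvs == 0 ? 0.5f : (float) distinctValuesInList / dvs;
    }

    // A multi-column (STRUCT) IN multiplies the per-column factors, i.e. the
    // columns are assumed to be independent.
    static long estimateRows(long numRows, List<long[]> cols) { // each entry: {dvs, listSize}
      float factor = 1f;
      for (long[] c : cols) {
        factor *= columnFactor(c[0], (int) c[1]);
      }
      return Math.round((double) numRows * factor);
    }

    public static void main(String[] args) {
      // struct(key, value) IN (11 tuples) over 500 rows without column stats:
      // 500 * 0.5 * 0.5 = 125, matching the pointlookup.q.out update below.
      System.out.println(estimateRows(500, Arrays.asList(new long[]{0, 11}, new long[]{0, 11})));
    }
  }

Note that neither the sketch nor the patch caps the factor: an IN list naming more distinct values than the column's NDV would push the estimate above the input row count, which a follow-up could clamp to 1.0 per column.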
diff --git ql/src/test/results/clientpositive/filter_cond_pushdown.q.out ql/src/test/results/clientpositive/filter_cond_pushdown.q.out
index 738286e..9fa144d 100644
--- ql/src/test/results/clientpositive/filter_cond_pushdown.q.out
+++ ql/src/test/results/clientpositive/filter_cond_pushdown.q.out
@@ -442,14 +442,14 @@ STAGE PLANS:
             Statistics: Num rows: 550 Data size: 5843 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
               predicate: (((_col1) IN ('2008-04-08', '2008-04-10') and (_col1) IN ('2008-04-08', '2008-04-09') and (_col3 = '2008-04-10')) or (_col3 = '2008-04-08')) (type: boolean)
-              Statistics: Num rows: 343 Data size: 3643 Basic stats: COMPLETE Column stats: NONE
+              Statistics: Num rows: 344 Data size: 3654 Basic stats: COMPLETE Column stats: NONE
               Select Operator
                 expressions: _col0 (type: string), _col1 (type: string), _col3 (type: string)
                 outputColumnNames: _col0, _col1, _col2
-                Statistics: Num rows: 343 Data size: 3643 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 344 Data size: 3654 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 343 Data size: 3643 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 344 Data size: 3654 Basic stats: COMPLETE Column stats: NONE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git ql/src/test/results/clientpositive/groupby_multi_single_reducer3.q.out ql/src/test/results/clientpositive/groupby_multi_single_reducer3.q.out
index 5362390..c5488de 100644
--- ql/src/test/results/clientpositive/groupby_multi_single_reducer3.q.out
+++ ql/src/test/results/clientpositive/groupby_multi_single_reducer3.q.out
@@ -72,7 +72,7 @@ STAGE PLANS:
         Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
         Filter Operator
           predicate: ((VALUE._col0) IN ('val_100', 'val_200', 'val_300') and (KEY._col0) IN (100, 150, 200)) (type: boolean)
-          Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE
+          Statistics: Num rows: 63 Data size: 669 Basic stats: COMPLETE Column stats: NONE
           Group By Operator
             aggregations: count()
             keys: KEY._col0 (type: string)
@@ -93,7 +93,7 @@
           name: default.e1
         Filter Operator
           predicate: ((VALUE._col0) IN ('val_400', 'val_500') and (KEY._col0) IN (400, 450)) (type: boolean)
-          Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE
+          Statistics: Num rows: 63 Data size: 669 Basic stats: COMPLETE Column stats: NONE
           Group By Operator
             aggregations: count()
             keys: KEY._col0 (type: string)
@@ -404,7 +404,7 @@ STAGE PLANS:
         Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
         Filter Operator
           predicate: ((VALUE._col0) IN ('val_100', 'val_200', 'val_300') and (KEY._col0) IN (100, 150, 200)) (type: boolean)
-          Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE
+          Statistics: Num rows: 63 Data size: 669 Basic stats: COMPLETE Column stats: NONE
           Group By Operator
             aggregations: count()
             keys: KEY._col0 (type: string)
@@ -425,7 +425,7 @@
           name: default.e1
         Filter Operator
           predicate: ((VALUE._col0) IN ('val_400', 'val_500') and (KEY._col0) IN (400, 450)) (type: boolean)
-          Statistics: Num rows: 62 Data size: 658 Basic stats: COMPLETE Column stats: NONE
+          Statistics: Num rows: 63 Data size: 669 Basic stats: COMPLETE Column stats: NONE
           Group By Operator
             aggregations: count()
             keys: KEY._col0 (type: string)
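The groupby_multi_single_reducer3 updates above are the conjunctive case: both predicates are INs over columns without statistics, so each contributes the default 0.5 factor and the conjuncts are applied one at a time. The one-row bump from 62 to 63 comes from Math.round replacing the old integer halving; a quick check of that arithmetic (hypothetical snippet, not part of the patch):

  public final class AndOfInsCheck {
    public static void main(String[] args) {
      long rows = 250;
      rows = Math.round(rows * 0.5d); // first IN:  250 -> 125
      rows = Math.round(rows * 0.5d); // second IN: 125 -> 63 (62.5 rounds up)
      System.out.println(rows);       // the old numRows / 2 default gave 250 -> 125 -> 62
    }
  }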
diff --git ql/src/test/results/clientpositive/pcs.q.out ql/src/test/results/clientpositive/pcs.q.out
index a1382f1..d6d2431 100644
--- ql/src/test/results/clientpositive/pcs.q.out
+++ ql/src/test/results/clientpositive/pcs.q.out
@@ -921,17 +921,17 @@ STAGE PLANS:
           Filter Operator
             isSamplingPred: false
             predicate: (struct(_col2,_col0,_col8)) IN (const struct('2000-04-08',1,'2000-04-09'), const struct('2000-04-09',2,'2000-04-08')) (type: boolean)
-            Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: NONE
+            Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE
             Select Operator
               expressions: _col2 (type: string), _col6 (type: int)
               outputColumnNames: _col0, _col1
-              Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: NONE
+              Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE
               File Output Operator
                 compressed: false
                 GlobalTableId: 0
 #### A masked pattern was here ####
                 NumFilesPerFileSink: 1
-                Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE Column stats: NONE
 #### A masked pattern was here ####
                 table:
                     input format: org.apache.hadoop.mapred.SequenceFileInputFormat
diff --git ql/src/test/results/clientpositive/pointlookup.q.out ql/src/test/results/clientpositive/pointlookup.q.out
index 460cc74..78dd7bc 100644
--- ql/src/test/results/clientpositive/pointlookup.q.out
+++ ql/src/test/results/clientpositive/pointlookup.q.out
@@ -111,14 +111,14 @@ STAGE PLANS:
             Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
               predicate: (struct(key,value)) IN (const struct('0','8'), const struct('1','5'), const struct('2','6'), const struct('3','8'), const struct('4','1'), const struct('5','6'), const struct('6','1'), const struct('7','1'), const struct('8','1'), const struct('9','1'), const struct('10','3')) (type: boolean)
-              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+              Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE
               Select Operator
                 expressions: key (type: string)
                 outputColumnNames: _col0
-                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -177,14 +177,14 @@ STAGE PLANS:
             Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
               predicate: (struct(key,value)) IN (const struct('0','8'), const struct('1','5'), const struct('2','6'), const struct('3','8'), const struct('4','1'), const struct('5','6'), const struct('6','1'), const struct('7','1'), const struct('8','1'), const struct('9','1'), const struct('10','3')) (type: boolean)
-              Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+              Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE
               Select Operator
                 expressions: key (type: string)
                 outputColumnNames: _col0
-                Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE
                   table:
                       input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
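pcs.q.out and pointlookup.q.out exercise the STRUCT path: with Column stats: NONE, every column named inside struct(...) contributes its own 0.5 factor, so the estimate compounds per column instead of being a single halving. A hypothetical check of both updates (the 44-row input for pcs is inferred from its old numRows / 2 estimate of 22):

  public final class StructInCheck {
    public static void main(String[] args) {
      // pointlookup.q.out: struct(key,value), 2 columns over 500 rows
      System.out.println(Math.round(500 * 0.5d * 0.5d));        // 125 (was 500 / 2 = 250)
      // pcs.q.out: struct(_col2,_col0,_col8), 3 columns over 44 rows
      System.out.println(Math.round(44 * 0.5d * 0.5d * 0.5d));  // 6, from 5.5 (was 44 / 2 = 22)
    }
  }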
diff --git ql/src/test/results/clientpositive/pointlookup2.q.out ql/src/test/results/clientpositive/pointlookup2.q.out
index 869e4cd..6fc6e7f 100644
--- ql/src/test/results/clientpositive/pointlookup2.q.out
+++ ql/src/test/results/clientpositive/pointlookup2.q.out
@@ -1169,7 +1169,7 @@ STAGE PLANS:
           Filter Operator
             isSamplingPred: false
             predicate: (struct(_col2,_col4)) IN (const struct('2000-04-08',1), const struct('2000-04-09',2)) (type: boolean)
-            Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: NONE
+            Statistics: Num rows: 11 Data size: 88 Basic stats: COMPLETE Column stats: NONE
             File Output Operator
               compressed: false
               GlobalTableId: 0
@@ -1197,7 +1197,7 @@
               key expressions: _col4 (type: int), _col5 (type: string), _col2 (type: string)
               null sort order: aaa
               sort order: +++
-              Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: NONE
+              Statistics: Num rows: 11 Data size: 88 Basic stats: COMPLETE Column stats: NONE
               tag: -1
               value expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string)
               auto parallelism: false
@@ -1231,13 +1231,13 @@
         Select Operator
           expressions: VALUE._col0 (type: int), VALUE._col1 (type: string), KEY.reducesinkkey2 (type: string), VALUE._col2 (type: string), KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: string)
           outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
-          Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: NONE
+          Statistics: Num rows: 11 Data size: 88 Basic stats: COMPLETE Column stats: NONE
          File Output Operator
            compressed: false
            GlobalTableId: 0
 #### A masked pattern was here ####
            NumFilesPerFileSink: 1
-           Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: NONE
+           Statistics: Num rows: 11 Data size: 88 Basic stats: COMPLETE Column stats: NONE
 #### A masked pattern was here ####
            table:
                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
@@ -1590,7 +1590,7 @@
           Filter Operator
             isSamplingPred: false
             predicate: (struct(_col0,_col3)) IN (const struct(1,'2000-04-08'), const struct(2,'2000-04-09')) (type: boolean)
-            Statistics: Num rows: 16 Data size: 128 Basic stats: COMPLETE Column stats: NONE
+            Statistics: Num rows: 8 Data size: 64 Basic stats: COMPLETE Column stats: NONE
             File Output Operator
               compressed: false
               GlobalTableId: 0
@@ -1618,7 +1618,7 @@
               key expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string)
               null sort order: aaa
               sort order: +++
-              Statistics: Num rows: 16 Data size: 128 Basic stats: COMPLETE Column stats: NONE
+              Statistics: Num rows: 8 Data size: 64 Basic stats: COMPLETE Column stats: NONE
               tag: -1
               value expressions: _col2 (type: string), _col4 (type: int), _col5 (type: string)
               auto parallelism: false
@@ -1652,13 +1652,13 @@
         Select Operator
           expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: string), VALUE._col0 (type: string), KEY.reducesinkkey2 (type: string), VALUE._col1 (type: int), VALUE._col2 (type: string)
           outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
-          Statistics: Num rows: 16 Data size: 128 Basic stats: COMPLETE Column stats: NONE
+          Statistics: Num rows: 8 Data size: 64 Basic stats: COMPLETE Column stats: NONE
          File Output Operator
            compressed: false
            GlobalTableId: 0
 #### A masked pattern was here ####
            NumFilesPerFileSink: 1
-           Statistics: Num rows: 16 Data size: 128 Basic stats: COMPLETE Column stats: NONE
+           Statistics: Num rows: 8 Data size: 64 Basic stats: COMPLETE Column stats: NONE
 #### A masked pattern was here ####
            table:
                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
diff --git ql/src/test/results/clientpositive/pointlookup3.q.out ql/src/test/results/clientpositive/pointlookup3.q.out
index e98ba76..2b25b39 100644
--- ql/src/test/results/clientpositive/pointlookup3.q.out
+++ ql/src/test/results/clientpositive/pointlookup3.q.out
@@ -1337,7 +1337,7 @@ STAGE PLANS:
           Filter Operator
             isSamplingPred: false
             predicate: (struct(_col2,_col4)) IN (const struct('2000-04-08',1), const struct('2000-04-09',2)) (type: boolean)
-            Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: NONE
+            Statistics: Num rows: 11 Data size: 88 Basic stats: COMPLETE Column stats: NONE
             File Output Operator
               compressed: false
               GlobalTableId: 0
@@ -1365,7 +1365,7 @@
               key expressions: _col4 (type: int), _col5 (type: string), _col2 (type: string)
               null sort order: aaa
               sort order: +++
-              Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: NONE
+              Statistics: Num rows: 11 Data size: 88 Basic stats: COMPLETE Column stats: NONE
               tag: -1
               value expressions: _col0 (type: int), _col1 (type: string), _col3 (type: string), _col6 (type: string), _col7 (type: string)
               auto parallelism: false
@@ -1399,13 +1399,13 @@
         Select Operator
           expressions: VALUE._col0 (type: int), VALUE._col1 (type: string), KEY.reducesinkkey2 (type: string), VALUE._col2 (type: string), KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: string), VALUE._col3 (type: string), VALUE._col4 (type: string)
           outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7
-          Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: NONE
+          Statistics: Num rows: 11 Data size: 88 Basic stats: COMPLETE Column stats: NONE
          File Output Operator
            compressed: false
            GlobalTableId: 0
 #### A masked pattern was here ####
            NumFilesPerFileSink: 1
-           Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: NONE
+           Statistics: Num rows: 11 Data size: 88 Basic stats: COMPLETE Column stats: NONE
 #### A masked pattern was here ####
            table:
                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
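The pointlookup2 and pointlookup3 plans above follow the same two-column pattern: each struct IN now yields a quarter of its input rows instead of half. The filter inputs are not shown in these hunks, but they can be recovered from the old numRows / 2 defaults; a hypothetical consistency check:

  public final class PointlookupCheck {
    public static void main(String[] args) {
      long input1 = 2 * 22; // old estimate 22 implies 44 input rows
      System.out.println(Math.round(input1 * 0.5d * 0.5d)); // 11, the new estimate
      long input2 = 2 * 16; // old estimate 16 implies 32 input rows
      System.out.println(Math.round(input2 * 0.5d * 0.5d)); // 8
    }
  }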