diff --git a/data/files/dept.txt b/data/files/dept.txt new file mode 100644 index 0000000..292bee6 --- /dev/null +++ b/data/files/dept.txt @@ -0,0 +1,4 @@ +31|sales +33|engineering +34|clerical +35|marketing diff --git a/data/files/emp.txt b/data/files/emp.txt new file mode 100755 index 0000000..8665e58 --- /dev/null +++ b/data/files/emp.txt @@ -0,0 +1,6 @@ +Rafferty|31 +Jones|33 +Steinberg|33 +Robinson|34 +Smith|34 +John| diff --git a/data/files/loc.txt b/data/files/loc.txt new file mode 100755 index 0000000..69910b7 --- /dev/null +++ b/data/files/loc.txt @@ -0,0 +1,8 @@ +OH|31|43201|2001 +IO|32|43202|2001 +CA|35|43809|2001 +FL|33|54342|2001 +UT|35||2001 +CA|35|43809|2001 +|34|40000| +FL|33|54342|2001 diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java index d320b47..ffda8d2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java @@ -21,6 +21,7 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Set; @@ -32,9 +33,15 @@ import org.apache.hadoop.hive.ql.exec.persistence.AbstractRowContainer; import org.apache.hadoop.hive.ql.exec.persistence.RowContainer; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.JoinCondDesc; import org.apache.hadoop.hive.ql.plan.JoinDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.serde2.io.ShortWritable; import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -42,6 +49,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + /** * Join operator implementation. */ @@ -825,4 +835,123 @@ public boolean opAllowedBeforeMapJoin() { public boolean opAllowedAfterMapJoin() { return false; } + + @Override + public Statistics getStatistics(HiveConf hiveconf) throws HiveException { + // There are three cases + // 1: The values of join keys are disjoint in both relations in which case + // T(RXS) = 0 + // 2: Join key is primary key on relation R and foreign key on relation S + // in which case every tuple in S will have a tuple in R + // T(RXS) = T(S) + // 3: refer book + + // in general case T(RXS) = (T(R)*T(S))/max(V(R,Y), V(S,Y)) where Y is the + // join attribute + Statistics stats = conf.getStatistics(); + if(stats == null) { + stats = super.getStatistics(hiveconf); + Map> outColExprMap = StatsUtils.getOuputColumnExprMap(this, stats); + Map tableAliasMap = StatsUtils.getTableAliasMap(this, stats); + // The column expression map at this point will have mapping between + // table name, column name and internal mapping. There can be multiple + // table names and internal mapping for each table columns will start from + // _col0. 
We need to update the internal column mapping to monotonically + // increasing column numbers. + + // Example: + // Input = {tabName1 => {_col0 => column1, _col1 => column2}, tabName2 => {_col0 => attr1, _col1 => attr2}} + // Output = {tabName1 => {_col0 => column1, _col1 => column2}, tabName2 => {_col2 => attr1, _col3 => attr2}} + updateOutputColumnExprMap(this, outColExprMap, tableAliasMap); + + // get the join keys from parent ReduceSink operators + long prodRows = 1; + List distinctVals = Lists.newArrayList(); + boolean multiAttr = false; + for (int pos = 0; pos < this.getParentOperators().size(); pos++) { + ReduceSinkOperator parent = (ReduceSinkOperator) this.getParentOperators().get(pos); + ReduceSinkDesc rsconf = parent.getConf(); + List keys = rsconf.getKeyCols(); + List dvs = Lists.newArrayList(); + if(keys.size() > 1) { + multiAttr = true; + } + for(ExprNodeDesc end : keys) { + if(end instanceof ExprNodeColumnDesc) { + ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end; + + // FIXME: fix for multiple tables + Statistics parentStats = parent.getStatistics(hiveconf); + long dv = StatsUtils.getDistinctCountOfColumnFromTable(parentStats, parentStats.getDbName(), parentStats.getTableName(), encd.getColumn()); + dvs.add(dv); + } + } + long maxDV = Collections.max(dvs); + prodRows = prodRows * parent.getStatistics(hiveconf).getNumRows(); + distinctVals.add(maxDV); + } + + long denom = 1; + if(multiAttr) { + for(Long dv : distinctVals) { + denom = denom * dv; + } + } else { + denom = Collections.max(distinctVals); + } + long newRowCount = prodRows / denom; + StatsUtils.updateStats(stats, newRowCount); + } + return super.getStatistics(hiveconf); + } + + private void updateOutputColumnExprMap(CommonJoinOperator commonJoinOperator, + Map> outColExprMap, Map tableAliasMap) { + Map> exprMap = commonJoinOperator.getConf().getExprs(); + Map revExprMap = commonJoinOperator.getConf().getReversedExprs(); + Map> posToAliasMap = commonJoinOperator.getPosToAliasMap(); + List outColNames = commonJoinOperator.getConf().getOutputColumnNames(); + // mapping between out column name to input column names. Both output and + // input column names are internal names. 
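The estimate computed in getStatistics() above follows the textbook rule T(R JOIN S) = T(R)*T(S) / max(V(R,Y), V(S,Y)) for a single join key, with the denominator becoming a product of the per-relation maxima when the key has multiple attributes. A minimal standalone sketch of that prodRows/denom computation follows; the method name and the plain arrays standing in for the ReduceSink row counts and column NDVs are hypothetical, not part of the patch.

    // rowCounts[pos] = T(R_pos); ndvPerKey[pos][k] = V(R_pos, k-th join key column)
    static long estimateJoinRows(long[] rowCounts, long[][] ndvPerKey) {
        long prodRows = 1;
        long maxOfMaxDV = 1;   // denominator when there is a single join key
        long prodOfMaxDV = 1;  // denominator when the key has multiple attributes
        boolean multiAttr = ndvPerKey[0].length > 1;
        for (int pos = 0; pos < rowCounts.length; pos++) {
            long maxDV = 1;
            for (long dv : ndvPerKey[pos]) {
                maxDV = Math.max(maxDV, dv);   // largest NDV among this relation's key columns
            }
            prodRows *= rowCounts[pos];
            maxOfMaxDV = Math.max(maxOfMaxDV, maxDV);
            prodOfMaxDV *= maxDV;
        }
        return prodRows / (multiAttr ? prodOfMaxDV : maxOfMaxDV);
    }
    // e.g. emp JOIN dept ON deptno with the sample files above:
    // T(emp)=6, T(dept)=4, V(emp,deptno)=3, V(dept,deptno)=4  ->  6*4 / max(3,4) = 6 rows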
+ Map outColToInColMap = Maps.newHashMap(); + + int prevTablePos = -1; + int idx = 0; + for(String outColName : outColNames) { + int currTablePos = revExprMap.get(outColName); + if(prevTablePos == -1) { + prevTablePos = currTablePos; + } + + // new table + if(prevTablePos != currTablePos) { + idx = 0; + prevTablePos = currTablePos; + } + + outColToInColMap.put(outColName, "_col" + idx); + idx++; + } + + + for(Map.Entry entry : revExprMap.entrySet()) { + String col = entry.getKey(); + Byte pos = entry.getValue(); + Set aliases = posToAliasMap.get(pos.intValue()); + for(String alias : aliases) { + String tabName = tableAliasMap.get(alias); + + Map cem = outColExprMap.get(tabName); + + if(!cem.containsKey(col)) { + String mappedKey = outColToInColMap.get(col); + if(cem.containsKey(mappedKey)) { + cem.put(col, cem.get(mappedKey)); + cem.remove(mappedKey); + outColExprMap.put(tabName, cem); + } + } + } + } + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java index 516ba42..95731c8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java @@ -24,8 +24,27 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.IOContext; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.FilterDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.ql.stats.StatsUtils; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualNS; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNot; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotEqual; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.io.LongWritable; @@ -96,16 +115,16 @@ public void processOp(Object row, int tag) throws HiveException { if (conf.isSortedFilter() && ioContext.useSorted()) { if (!(conditionEvaluator instanceof ExprNodeGenericFuncEvaluator)) { LOG.error("Attempted to use the fact data is sorted when the conditionEvaluator is not " + - "of type ExprNodeGenericFuncEvaluator"); + "of type ExprNodeGenericFuncEvaluator"); ioContext.setUseSorted(false); return; } else { - ioContext.setComparison(((ExprNodeGenericFuncEvaluator)conditionEvaluator).compare(row)); + ioContext.setComparison(((ExprNodeGenericFuncEvaluator) conditionEvaluator).compare(row)); } if 
(ioContext.getGenericUDFClassName() == null) { ioContext.setGenericUDFClassName( - ((ExprNodeGenericFuncEvaluator)conditionEvaluator).genericUDF.getClass().getName()); + ((ExprNodeGenericFuncEvaluator) conditionEvaluator).genericUDF.getClass().getName()); } // If we are currently searching the data for a place to begin, do not return data yet @@ -123,8 +142,8 @@ public void processOp(Object row, int tag) throws HiveException { Object condition = conditionEvaluator.evaluate(row); // If we are currently performing a binary search on the input, don't forward the results - // Currently this value is set when a query is optimized using a compact index. The map reduce - // job responsible for scanning and filtering the index sets this value. It remains set + // Currently this value is set when a query is optimized using a compact index. The map reduce + // job responsible for scanning and filtering the index sets this value. It remains set // throughout the binary search executed by the HiveBinarySearchRecordResder until a starting // point for a linear scan has been identified, at which point this value is unset. if (ioContext.isBinarySearching()) { @@ -185,4 +204,173 @@ public boolean supportAutomaticSortMergeJoin() { public boolean supportUnionRemoveOptimization() { return true; } + + @Override + public Statistics getStatistics(HiveConf hiveconf) throws HiveException { + // Filter operator doesn't change the average row size but it does change + // the number of rows emitted. The reduction in the number of rows emitted + // is dependent on the filter expression. + + // Notations: + // T(S) - Number of tuples in relations S + // V(S,A) - Number of distinct values of attribute A in relation S + + // Rules: + // 1 - Column equals a constant + // T(S) = T(R) / V(R,A) + + // 2 - Inequality conditions + // T(S) = T(R) / 3 + + // 3 - Not equals comparison + // T(S) = T(R) + // (or) + // T(S) = T(R) (V(R,A) - 1) / V(R,A) + + // 4 - NOT condition + // T(S) = 1 - T(S'), where T(S') is the satisfying condition + + // 5 - Multiple AND conditions + // Cascadingly apply the rules 1 to 3 (order doesn't matter) + + // 6 - Multiple OR conditions + // Simple case is to evaluate conditions independently and sum the results + // T(S) = m1 + m2 + // (or) + // T(S) = T(R) * ( 1 - ( 1 - m1/T(R) ) * ( 1 - m2/T(R) )) + // where, m1 is the number of tuples that satisfy condition1 and + // m2 is the number of tuples that satisfy condition2 + + // For more information, refer 'Estimating The Cost Of Operations' chapter + // in "Database Systems: The Complete Book" by Garcia-Molina et. al. 
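Before the implementation, a small self-contained sketch of how the rules above combine numerically may help; the class name and the T(R)/V(R,A) figures are invented for illustration and are not taken from the patch or from Hive.

    // Rule-of-thumb selectivities from the comment block above,
    // for T(R) = 1200 rows and V(R,A) = 40 distinct values of column A.
    public class FilterSelectivitySketch {
        public static void main(String[] args) {
            long numRows = 1200L;                 // T(R)
            long ndv = 40L;                       // V(R,A)

            long eqRows = numRows / ndv;          // rule 1: A = c           ->  30
            long rangeRows = numRows / 3;         // rule 2: A < c           -> 400
            long neqRows = numRows;               // rule 3: A <> c (conservative)
            long andRows = eqRows / 3;            // rule 5: (A = c) AND (B < c2),
                                                  //         applied cascadingly -> 10
            // rule 6: (A = c) OR (B < c2), independence-style variant
            double sel1 = (double) eqRows / numRows;
            double sel2 = (double) rangeRows / numRows;
            long orRows = (long) (numRows * (1 - (1 - sel1) * (1 - sel2)));   // 420

            System.out.printf("=:%d <:%d <>:%d AND:%d OR:%d%n",
                eqRows, rangeRows, neqRows, andRows, orRows);
        }
    }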
+ Statistics stats = conf.getStatistics(); + if (stats == null) { + stats = super.getStatistics(hiveconf); + if (stats.getColStatsState().equals(Statistics.State.COMPLETE)) { + ExprNodeDesc root = conf.getPredicate(); + long newRowCount = evaluateExpression(stats, root); + StatsUtils.updateStats(stats, newRowCount); + } + } + return stats; + } + + private long evaluateExpression(Statistics stats, ExprNodeDesc pred) { + long newNumRows = 0; + Statistics andStats = null; + if (pred instanceof ExprNodeGenericFuncDesc) { + ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred; + GenericUDF udf = genFunc.getGenericUDF(); + if (udf instanceof GenericUDFOPAnd) { + // for AND condition cascadingly update stats + try { + andStats = stats.clone(); + stats.setAndExprStats(andStats); + } catch (CloneNotSupportedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + for (ExprNodeDesc child : genFunc.getChildExprs()) { + newNumRows = evaluateChildExpr(stats.getAndExprStats(), child); + StatsUtils.updateStats(stats.getAndExprStats(), newNumRows); + } + } else { + if (udf instanceof GenericUDFOPOr) { + // for OR condition independently compute and update stats + for (ExprNodeDesc child : genFunc.getChildExprs()) { + newNumRows += evaluateChildExpr(stats, child); + if(stats.getAndExprStats() != null) { + newNumRows += stats.getAndExprStats().getNumRows(); + stats.setAndExprStats(null); + } + } + } else if (udf instanceof GenericUDFOPNot) { + // FIXME: + } else if (udf instanceof GenericUDFOPNotNull) { + newNumRows = evaluateColEqualsNullExpr(stats, pred); + newNumRows = stats.getNumRows() - newNumRows; + } else if (udf instanceof GenericUDFOPNull) { + newNumRows = evaluateColEqualsNullExpr(stats, pred); + } else { + // single predicate condition + newNumRows = evaluateChildExpr(stats, pred); + } + } + } + + return newNumRows; + } + + private long evaluateColEqualsNullExpr(Statistics stats, ExprNodeDesc pred) { + // evaluate similar to "col = constant" expr + if (pred instanceof ExprNodeGenericFuncDesc) { + ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) pred; + for (ExprNodeDesc leaf : genFunc.getChildExprs()) { + if (leaf instanceof ExprNodeColumnDesc) { + ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) leaf; + String colName = colDesc.getColumn(); + // FIXME: fix if this is an internal name + long dvs = StatsUtils.getDistinctCountOfColumnFromTable(stats, stats.getDbName(), stats.getTableName(), colName); + if (dvs != 0) { + return stats.getNumRows() / dvs; + } else { + // FIXME: how to deal with this case? + return stats.getNumRows(); + } + } + } + } + return 0; + } + + private long evaluateChildExpr(Statistics stats, ExprNodeDesc child) { + if (child instanceof ExprNodeGenericFuncDesc) { + ExprNodeGenericFuncDesc genFunc = (ExprNodeGenericFuncDesc) child; + GenericUDF udf = genFunc.getGenericUDF(); + if (udf instanceof GenericUDFOPEqual || udf instanceof GenericUDFOPEqualNS) { + String colName = null; + boolean isConst = false; + for (ExprNodeDesc leaf : genFunc.getChildExprs()) { + if (leaf instanceof ExprNodeConstantDesc) { + // if the first argument is const then just set the flag and continue + if (colName == null) { + isConst = true; + continue; + } + long dvs = StatsUtils.getDistinctCountOfColumnFromTable(stats, stats.getDbName(), stats.getTableName(), colName); + if (dvs != 0) { + return stats.getNumRows() / dvs; + } else { + // FIXME: how to deal with this case? 
+ return stats.getNumRows(); + } + } else if (leaf instanceof ExprNodeColumnDesc) { + ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc) leaf; + colName = colDesc.getColumn(); + // FIXME: fix if this is an internal name + + // if const is first argument then evaluate the result + if (isConst) { + long dvs = StatsUtils.getDistinctCountOfColumnFromTable(stats, stats.getDbName(), stats.getTableName(), colName); + if (dvs != 0) { + return stats.getNumRows() / dvs; + } else { + // FIXME: how to deal with this case? + return stats.getNumRows(); + } + } + } + } + } else if (udf instanceof GenericUDFOPNotEqual) { + return stats.getNumRows(); + } else if (udf instanceof GenericUDFOPEqualOrGreaterThan + || udf instanceof GenericUDFOPEqualOrLessThan || + udf instanceof GenericUDFOPGreaterThan || udf instanceof GenericUDFOPLessThan) { + return stats.getNumRows() / 3; + } else { + return evaluateExpression(stats, genFunc); + } + } + return 0; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java index 7d05982..e15bfc9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java @@ -46,7 +46,9 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer; import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef; @@ -69,6 +71,9 @@ import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + /** * GroupBy operator implementation. */ @@ -1207,4 +1212,175 @@ public boolean acceptLimitPushdown() { return getConf().getMode() == GroupByDesc.Mode.MERGEPARTIAL || getConf().getMode() == GroupByDesc.Mode.COMPLETE; } + + @Override + public Statistics getStatistics(HiveConf hiveconf) throws HiveException { + // Group By operator changes the number of rows. The number of rows emitted + // by GBY operator will be atleast 1 or utmost T(R) i.e, number of rows in + // relation R. A better estimate can be found if we have column statistics + // on the columns that we are grouping on. + // Suppose if we are grouping by attributes A,B,C and if statistics for + // columns A,B,C are available then a better estimate can be found by taking + // the smaller of product of V(R,[A,B,C]) (product number of distincts of A,B,C) + // and T(R)/2. + // For more information, refer 'Estimating The Cost Of Operations' chapter + // in "Database Systems: The Complete Book" by Garcia-Molina et. al. 
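The estimate described above reduces to min(T(R)/2, product of V(R,ci) over the grouping columns), with a fallback to plain T(R)/2 when any grouping column lacks column statistics (its NDV is reported as 0, which zeroes the product). A minimal sketch with a hypothetical method name and inputs:

    static long estimateGroupByRows(long numRows, long[] ndvOfGroupByColumns) {
        long dvProd = 1;
        for (long ndv : ndvOfGroupByColumns) {
            dvProd *= ndv;       // a missing column statistic contributes 0
        }
        if (dvProd == 0) {
            return numRows / 2;  // no usable column stats: assume half the input survives
        }
        return Math.min(numRows / 2, dvProd);
    }
    // e.g. estimateGroupByRows(10000, new long[] {12, 30}) -> min(5000, 360) = 360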
+ Statistics stats = this.getConf().getStatistics(); + long newNumRows = 0; + + if(stats == null) { + stats = super.getStatistics(hiveconf); + Map> outColExprMap = stats.getOutColExprMap(); + if(outColExprMap == null) { + outColExprMap = Maps.newHashMap(); + StatsUtils.getExtToIntColMappings(this, outColExprMap); + stats.setOutColExprMap(outColExprMap); + } + + long dvProd = 1; + for(String attr : getGroupByAttributes(stats, outColExprMap, conf.getKeys())) { + dvProd *= StatsUtils.getDistinctCountOfColumnFromTable(stats, stats.getDbName(), stats.getTableName(), attr); + } + + if(this.parentOperators.get(0) instanceof ReduceSinkOperator) { + // reducer side GBY with GROUPING_SETS will reduce the number of rows + // because of aggregation. For ex: GBY(A,B) WITH CUBE, mapper will emit + // 4 rows for each input row. The reducer side GBY will aggregate the + // rows and thereby decreasing the number of rows. The number of rows + // can be estimated as follows + // T(R) = min(T(R)/2, T(R, GBY(A,B)) + T(R, GBY(A)) + T(R, GBY(B)) + 1)) + dvProd = 1; + if(isParentGBYContainsGroupingSet(this)) { + List gs = getGroupingSet(this); + if(gs != null) { + List dims = getGroupByAttributes(stats, outColExprMap, conf.getKeys()); + List dvs = Lists.newArrayList(); + + for(String dim : dims) { + dvs.add(StatsUtils.getDistinctCountOfColumnFromTable(stats, stats.getDbName(), stats.getTableName(), dim)); + } + + for(Integer gsIdx : gs) { + dvProd += getRowCountsForGroupingSet(gsIdx, dvs); + // if the estimated num rows already exceed half the number of rows + // then we can exit early to avoid expensive computation + if(dvProd > (stats.getNumRows() / 2)) { + break; + } + } + } + } + + if(dvProd != 0) { + newNumRows = Math.min(stats.getNumRows() / 2, dvProd); + } else { + newNumRows = stats.getNumRows() / 2; + } + StatsUtils.updateStats(stats, newNumRows); + return stats; + } + + + // if some column stats are missing then distinct values product will be 0 + if(dvProd != 0) { + newNumRows = Math.min(stats.getNumRows() / 2, dvProd); + } else { + newNumRows = stats.getNumRows() / 2; + } + + // if grouping set is present then it means a CUBE/ROLLUP/GROUPING_SET + // operation. In any of these cases, each row will be duplicated by + // for size of the grouping sets. For example: GBY(A,B,C) WITH CUBE will + // have GROUPING_SET(0, 1, 2, 3, 4, 5, 6, 7) which implies that each row + // will be duplicated 8 times for different combinations of aggregations. 
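To make the reducer-side grouping-set estimate concrete, here is a standalone sketch of the per-grouping-set term, mirroring the getRowCountsForGroupingSet helper further down: each bit set in the grouping-set id masks out one grouping column, whose NDV then contributes a factor of 1. The method name and arguments are illustrative only.

    // gsMask is the grouping-set id; ndvs[i] = V(R, i-th group-by column).
    // Bit idx of gsMask masks the column at position (ndvs.length - idx - 1),
    // matching the reverse-order handling in getRowCountsForGroupingSet.
    static long rowsForGroupingSet(int gsMask, long[] ndvs) {
        long result = 1;
        int size = ndvs.length;
        for (int idx = 0; idx < size; idx++) {
            boolean masked = ((gsMask >>> idx) & 1) == 1;
            result *= masked ? 1L : ndvs[size - idx - 1];
        }
        return result;
    }
    // GBY(A,B) WITH CUBE over V(R,A)=10, V(R,B)=20: the four grouping sets
    // contribute 200 + 10 + 20 + 1 = 231, so the reducer-side estimate is
    // min(T(R)/2, 231), as in the formula quoted above.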
+ // The number of rows/data size will be increased by a factor of the + // number of elements in grouping set + if(this.getConf().isGroupingSetsPresent()) { + int multiplier = this.getConf().getListGroupingSets().size(); + newNumRows = multiplier * newNumRows; + } + StatsUtils.updateStats(stats, newNumRows); + } + return stats; + } + + private long getRowCountsForGroupingSet(Integer gsIdx, List dvs) { + int val = gsIdx.intValue(); + int idx = 0; + int size = dvs.size(); + List cloneDVS = Lists.newArrayList(); + cloneDVS.addAll(dvs); + while(val != 0) { + if((val & 1) == 1) { + // set in the reverse order + cloneDVS.set(size - idx - 1, 1L); + } + val = val >>> 1; + idx++; + } + + long result = 1; + for(Long l : cloneDVS) { + result *= l; + } + return result; + } + + private List getGroupByAttributes(Statistics stats, Map> outColExprMap, + ArrayList keys) { + List attrs = Lists.newArrayList(); + for(ExprNodeDesc end : keys) { + if(end instanceof ExprNodeColumnDesc) { + ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end; + String colName = encd.getColumn(); + if((colName.startsWith("_col") || colName.startsWith("KEY._col")) && outColExprMap != null) { + if(colName.startsWith("KEY._col")) { + // strip off KEY. from column name + colName = colName.split("\\.")[1]; + } + ExprNodeDesc desc = outColExprMap.get(stats.getTableName()).get(colName); + if(desc instanceof ExprNodeColumnDesc) { + colName = ((ExprNodeColumnDesc)desc).getColumn(); + } + } + // if the colName still starts with _ then it might be constant projection + if(!colName.startsWith("_")) { + attrs.add(colName); + } + } + } + return attrs; + } + + private List getGroupingSet(GroupByOperator groupByOperator) { + Operator currOp = groupByOperator; + while (currOp != null) { + for (Operator op : currOp.getParentOperators()) { + if (op instanceof GroupByOperator) { + GroupByOperator gby = (GroupByOperator) op; + if (gby.getConf().isGroupingSetsPresent()) { + return gby.getConf().getListGroupingSets(); + } + } + currOp = op; + } + } + return null; + } + + private boolean isParentGBYContainsGroupingSet(GroupByOperator groupByOperator) { + Operator currOp = groupByOperator; + while (currOp != null) { + for (Operator op : currOp.getParentOperators()) { + if (op instanceof GroupByOperator) { + GroupByOperator gby = (GroupByOperator) op; + if (gby.getConf().isGroupingSetsPresent()) { + return true; + } + } + currOp = op; + } + } + return false; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/LimitOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/LimitOperator.java index 276902a..54880fa 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/LimitOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/LimitOperator.java @@ -21,8 +21,10 @@ import java.io.Serializable; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.LimitDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.api.OperatorType; /** @@ -76,4 +78,21 @@ public void closeOp(boolean abort) throws HiveException { } } + @Override + public Statistics getStatistics(HiveConf hiveconf) throws HiveException { + Statistics stats = this.getConf().getStatistics(); + if(stats == null) { + stats = super.getStatistics(hiveconf); + + int globalLimit = this.getConf().getLimit(); + // TODO: why? 
+ if(globalLimit == -1) { + globalLimit = this.getConf().getLeastRows(); + } + long newRawDataSize = ((stats.getRawDataSize() * globalLimit) / stats.getNumRows()); + stats.setNumRows(globalLimit); + stats.setRawDataSize(newRawDataSize); + } + return stats; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/MuxOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/MuxOperator.java index d5989ef..08ad17d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/MuxOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/MuxOperator.java @@ -26,10 +26,12 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.MuxDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; @@ -323,4 +325,17 @@ static public String getOperatorName() { public OperatorType getType() { return OperatorType.MUX; } + + @Override + public Statistics getStatistics(HiveConf hiveconf) throws HiveException { + // split the statistics equally between the children of MUX operator + int numChilds = this.childOperators.size(); + Statistics stats = conf.getStatistics(); + if(stats == null) { + stats = super.getStatistics(hiveconf); + stats.setNumRows(stats.getNumRows() * numChilds); + stats.setRawDataSize(stats.getRawDataSize() * numChilds); + } + return stats; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java index 9fc7afa..fab3990 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java @@ -32,6 +32,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.metadata.HiveException; @@ -39,6 +40,7 @@ import org.apache.hadoop.hive.ql.plan.Explain; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; @@ -1569,6 +1571,48 @@ public boolean acceptLimitPushdown() { return false; } + /** + * Computes and retrieves the stats for this operator. Default implementation + * just passes the statistics from its parent. If there are multiple parents + * then the statistics are accumulated/merge. If any of the parents have partial + * stats then the current operator will also have partial stats. 
+ * @return Statistics for this operator + */ + public Statistics getStatistics(HiveConf hiveconf) throws HiveException { + Statistics stats = this.getConf().getStatistics(); + + if (stats == null) { + stats = new Statistics(); + int numParents = 0; + if(this.getParentOperators() != null) { + numParents = this.getParentOperators().size(); + } + + Statistics parentStats = this.getParentOperators().get(0).getStatistics(hiveconf); + stats.setDbName(parentStats.getDbName()); + stats.setTableName(parentStats.getTableName()); + stats.setAllColumns(parentStats.getAllColumns()); + stats.setSelectedColumns(parentStats.getSelectedColumns()); + stats.setBasicStatsState(parentStats.getBasicStatsState()); + stats.setLevel(parentStats.getLevel()); + stats.setColStatsState(parentStats.getColStatsState()); + stats.setNumRows(parentStats.getNumRows()); + stats.setRawDataSize(parentStats.getRawDataSize()); + stats.setTableColStats(parentStats.getTableColStats()); + stats.setPartitionColStats(parentStats.getPartitionColStats()); + stats.setAndExprStats(parentStats.getAndExprStats()); + stats.setOutColExprMap(parentStats.getOutColExprMap()); + + if(numParents > 1) { + for(int i = 1; i < numParents; i++) { + stats.merge(this.getParentOperators().get(i).getStatistics(hiveconf)); + } + } + this.getConf().setStatistics(stats); + } + return stats; + } + @Override public String toString() { return getName() + "[" + getIdentifier() + "]"; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/PTFOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/PTFOperator.java index a249d74..254b5cc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/PTFOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/PTFOperator.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.PTFDesc; import org.apache.hadoop.hive.ql.plan.PTFDeserializer; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.ql.plan.ptf.PTFExpressionDef; import org.apache.hadoop.hive.ql.plan.ptf.PTFInputDef; @@ -289,4 +290,17 @@ public static void connectLeadLagFunctionsToPartition(PTFDesc ptfDesc, } } + @Override + public Statistics getStatistics(HiveConf hiveconf) throws HiveException { + // PTF operator doesn't change the input rows. However it adds new column + // for each aggregation. This new column addition can be handled here or + // delegated to the downstream SEL operator. Child of PTFOperator is Select + // operator. Select operator already handles column, constand and UDF + // projection. + // For now we will let the downstream select operator handle statistics for PTFOperator. 
+ + // passthrough + return super.getStatistics(hiveconf); + } + } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java index 615c854..393776a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java @@ -32,6 +32,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.serde2.Serializer; @@ -43,8 +44,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; /** * Reduce Sink Operator sends output to the reduce stage. @@ -431,4 +432,9 @@ public OperatorType getType() { public boolean opAllowedBeforeMapJoin() { return false; } + + @Override + public Statistics getStatistics(HiveConf hiveconf) throws HiveException { + return super.getStatistics(hiveconf); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java index 025bf9e..1c1447d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java @@ -20,15 +20,26 @@ import java.io.Serializable; import java.util.List; +import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.ql.stats.StatsUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + /** * Select operator implementation. */ @@ -129,4 +140,159 @@ public boolean supportUnionRemoveOptimization() { public boolean acceptLimitPushdown() { return true; } + + @Override + public Statistics getStatistics(HiveConf hiveconf) throws HiveException { + // Projection doesn't change the number of rows emitted from the parent + // operator. It changes the size of each tuple emitted. In a typical case, + // where only subset of columns are selected the average row size will + // reduce as some of the columns will be pruned. In order to accurately + // compute the average row size, column level statistics is required. + // Column level statistics stores average size of values in column which + // can be used to more reliably estimate the reduction in size of each + // tuple. 
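A rough sketch of the effect described above: the row count is unchanged while the raw data size shrinks to the sum of the average sizes of the projected columns, plus anything contributed by constant or UDF projections. The method name and the per-column averages below are hypothetical stand-ins for the values the patch reads from column statistics.

    // avgColSize[i] = average serialized size of the i-th projected column;
    // constProjSizePerRow = extra bytes per row from constant/UDF projections
    static long projectedRawDataSize(long numRows, long[] avgColSize, long constProjSizePerRow) {
        long avgRowSize = constProjSizePerRow;
        for (long size : avgColSize) {
            avgRowSize += size;
        }
        return numRows * avgRowSize;
    }
    // e.g. 1000 rows, projected columns averaging 8 and 42 bytes, no constants:
    // projectedRawDataSize(1000, new long[] {8, 42}, 0) -> 50000 bytes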
+ + // If column level statistics are not available then use the standard fixed + // size for primitive data types. FIXME: How to handle variable length types + // in case if column stats are not available. + + // For more information, refer 'Estimating The Cost Of Operations' chapter + // in "Database Systems: The Complete Book" by Garcia-Molina et. al. + Statistics stats = conf.getStatistics(); + List colList = conf.getColList(); + boolean isConstPresent = checkForConstOrUDFProjection(colList); + + if (stats == null) { + stats = super.getStatistics(hiveconf); + if (conf.isSelectStar()) { +// if (conf.getColList().size() == 0) { +// stats.setNumRows(1); +// stats.setRawDataSize(StatsUtils.getSizeOfFixedLengthPrimitivesFromType("bigint")); +// } + return stats; + } + // save the mapping between column aliases to the internal aliases. + // This is required for computing the size of projected column with alias + // renaming. Ex: select col1 as col_a, col2 as col_b from table; + Map> outColExprMap = stats.getOutColExprMap(); + if(outColExprMap == null) { + outColExprMap = Maps.newHashMap(); + StatsUtils.getExtToIntColMappings(this, outColExprMap); + stats.setOutColExprMap(outColExprMap); + } + + if (!conf.isSelStarNoCompute() && stats.getBasicStatsState().equals(Statistics.State.COMPLETE)) { + List projColAliases = stats.getSelectedColumns(); + + boolean[] colSelections = new boolean[projColAliases.size()]; + LOG.info("Raw data size before SELECT operator: " + stats.getRawDataSize()); + + for (String col : projColAliases) { + String actualColName = col; + if(col.startsWith("_") && outColExprMap != null) { + ExprNodeDesc desc = outColExprMap.get(stats.getTableName()).get(col); + if(desc instanceof ExprNodeColumnDesc) { + actualColName = ((ExprNodeColumnDesc)desc).getColumn(); + } + } + int colIdx = stats.getSelectedColumns().indexOf(actualColName); + + // this could be a constant projection + if (colIdx == -1 && col.startsWith("_")) { + continue; + } + + colSelections[colIdx] = true; + } + + List selColStats = StatsUtils.getColumnStatsForSelectedColumns(stats, + colSelections); + if(selColStats == null) { + LOG.info("Column statistics not available"); + return stats; + } + long reducedRawDataSize = StatsUtils.getRawDataSizeForSelectedColumns(stats, selColStats); + LOG.info("Raw data size after SELECT operator: " + reducedRawDataSize); + + // constant projection adds to the data size + if (isConstPresent) { + reducedRawDataSize += getConstOrUDFProjectionSize(stats.getNumRows(), conf.getColList()); + } + stats.setRawDataSize(reducedRawDataSize); + } + } + return stats; + } + + private List getProjectedColumnAliases(List colList) { + List result = Lists.newArrayList(); + for(ExprNodeDesc end : colList) { + if(end instanceof ExprNodeColumnDesc) { + result.addAll(end.getCols()); + } + } + return result; + } + + private long getConstOrUDFProjectionSize(long numRows, List colList) { + long result = 0; + for (ExprNodeDesc end : colList) { + String colType = null; + ObjectInspector oi = null; + + if (end instanceof ExprNodeConstantDesc) { + ExprNodeConstantDesc encd = (ExprNodeConstantDesc) end; + colType = encd.getTypeString(); + oi = encd.getWritableObjectInspector(); + } else if (end instanceof ExprNodeGenericFuncDesc) { + ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end; + colType = engfd.getTypeString(); + oi = engfd.getWritableObjectInspector(); + } else if(end instanceof ExprNodeColumnDesc) { + ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end; + // if its a virtual column + 
if(encd.getIsPartitionColOrVirtualCol()) { + colType = encd.getTypeInfo().getTypeName(); + oi = encd.getWritableObjectInspector(); + } else { + // else it is column projection + continue; + } + } else { + // may be an expression? FIXME: + continue; + } + + if (colType.equalsIgnoreCase("string") || + colType.equalsIgnoreCase("binary") || + colType.startsWith("array") || + colType.startsWith("map") || + colType.startsWith("struct") || + colType.startsWith("union")) { + // FIXME: do we need to multiply the number of rows here? if constants are + // immutable then there will be only one copy until a reduce or file sink + // reached. + result += numRows * StatsUtils.getSizeOfVariableLengthTypes(oi, colType); + } else { + result += numRows * StatsUtils.getSizeOfFixedLengthPrimitivesFromType(colType); + } + } + return result; + } + + private boolean checkForConstOrUDFProjection(List colList) { + for (ExprNodeDesc end : colList) { + if(end instanceof ExprNodeColumnDesc) { + ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end; + // newly added virtual column + if(encd.getIsPartitionColOrVirtualCol()) { + return true; + } + } + if (end instanceof ExprNodeConstantDesc || end instanceof ExprNodeGenericFuncDesc) { + return true; + } + } + return false; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java index e538092..21d30ea 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java @@ -27,15 +27,18 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.FileUtils; +import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.ErrorMsg; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.api.OperatorType; import org.apache.hadoop.hive.ql.stats.StatsPublisher; import org.apache.hadoop.hive.ql.stats.StatsSetupConst; +import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; @@ -320,6 +323,17 @@ private void publishStats() throws HiveException { } @Override + public Statistics getStatistics(HiveConf hiveconf) throws HiveException { + Statistics stats = this.getConf().getStatistics(); + if (stats == null) { + stats = StatsUtils.collectStatistics(alias, getConf().getTable(), hiveconf, this, getConf() + .getPruningPredicate()); + this.getConf().setStatistics(stats); + } + return stats; + } + + @Override public boolean supportSkewJoinOptimization() { return true; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AnnotateStatsProc.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AnnotateStatsProc.java new file mode 100644 index 0000000..053988d --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AnnotateStatsProc.java @@ -0,0 +1,27 @@ +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.Stack; + +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import 
org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +public class AnnotateStatsProc implements NodeProcessor { + + @Override + public Object + process(Node nd, Stack stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { + AnnotateStatsProcCtx context = (AnnotateStatsProcCtx) procCtx; + FileSinkOperator sink = (FileSinkOperator) nd; + try { + sink.getStatistics(context.getConf()); + } catch (HiveException e) { + throw new SemanticException(e); + } + return false; + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AnnotateStatsProcCtx.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AnnotateStatsProcCtx.java new file mode 100644 index 0000000..f6751f1 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AnnotateStatsProcCtx.java @@ -0,0 +1,58 @@ +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.Map; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; + +public class AnnotateStatsProcCtx implements NodeProcessorCtx { + + private ParseContext pctx; + private HiveConf conf; + private Map> outColExprMap = null; + private Statistics andExprStats = null; + + public AnnotateStatsProcCtx(ParseContext pctx) { + this.setParseContext(pctx); + if(pctx != null) { + this.setConf(pctx.getConf()); + } else { + this.setConf(null); + } + } + + public HiveConf getConf() { + return conf; + } + + public void setConf(HiveConf conf) { + this.conf = conf; + } + + public ParseContext getParseContext() { + return pctx; + } + + public void setParseContext(ParseContext pctx) { + this.pctx = pctx; + } + + public Map> getOutColExprMap() { + return outColExprMap; + } + + public void setOutColExprMap(Map> outColExprMap) { + this.outColExprMap = outColExprMap; + } + + public Statistics getAndExprStats() { + return andExprStats; + } + + public void setAndExprStats(Statistics andExprStats) { + this.andExprStats = andExprStats; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AnnotateWithStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AnnotateWithStatistics.java new file mode 100644 index 0000000..b2269f6 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/AnnotateWithStatistics.java @@ -0,0 +1,64 @@ +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Stack; + +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +public class AnnotateWithStatistics implements Transform { + + @Override + public ParseContext 
transform(ParseContext pctx) throws SemanticException { + + AnnotateStatsProcCtx aspCtx = new AnnotateStatsProcCtx(pctx); + + // create a walker which walks the tree in a DFS manner while maintaining + // the operator stack. The dispatcher generates the plan from the operator + // tree + Map opRules = new LinkedHashMap(); + opRules.put( + new RuleRegExp(new String("Set statistics - FileSink"), FileSinkOperator.getOperatorName() + + "%"), getAnnotateStatsProc()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, aspCtx); + GraphWalker ogw = new DefaultGraphWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.addAll(pctx.getTopOps().values()); + ogw.startWalking(topNodes, null); + + return pctx; + } + + private NodeProcessor getAnnotateStatsProc() { + return new AnnotateStatsProc(); + } + + private NodeProcessor getDefaultProc() { + return new NodeProcessor() { + @Override + public Object process(Node nd, + Stack stack, + NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + return null; + } + }; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java index 0798470..6ffa595 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ColumnPrunerProcFactory.java @@ -583,7 +583,7 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, List cols = cppCtx.genColLists(op); SelectDesc conf = op.getConf(); - + if (lvJoin != null) { // get columns for SEL(*) from LVJ RowResolver rr = cppCtx.getOpToParseCtxMap().get(op).getRowResolver(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java index 3a76bfc..b7a7685 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java @@ -111,6 +111,10 @@ public void initialize(HiveConf hiveConf) { if (HiveConf.getFloatVar(hiveConf, HiveConf.ConfVars.HIVELIMITPUSHDOWNMEMORYUSAGE) > 0) { transformations.add(new LimitPushdownOptimizer()); } + // for now we will do annotation only in explain + if (pctx.getContext().getExplain()) { + transformations.add(new AnnotateWithStatistics()); + } transformations.add(new SimpleFetchOptimizer()); // must be called last if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEFETCHTASKAGGR)) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PrunerOperatorFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PrunerOperatorFactory.java index 51464e5..0004a0e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PrunerOperatorFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/PrunerOperatorFactory.java @@ -125,6 +125,9 @@ protected void addPruningPred(Map opToPrunner, // Put the mapping from table scan operator to pruner_pred opToPrunner.put(top, pruner_pred); + // Set the predicate in the table directly + top.getConf().setPruningPredicate(pruner_pred); + return; } @@ -165,6 +168,9 @@ protected void addPruningPred(Map> // Put the mapping from table scan operator to part-pruner map opToPrunner.put(top, partToPruner); + // Set the predicate in the table directly + 
top.getConf().setPruningPredicate(pruner_pred); + return; } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java index 5412373..99f9345 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java @@ -157,7 +157,7 @@ public static PrunedPartitionList prune(TableScanOperator ts, ParseContext parse * pruner condition. * @throws HiveException */ - private static PrunedPartitionList prune(Table tab, ExprNodeDesc prunerExpr, + public static PrunedPartitionList prune(Table tab, ExprNodeDesc prunerExpr, HiveConf conf, String alias, Map prunedPartitionsMap) throws HiveException { LOG.trace("Started pruning partiton"); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java index e67d73d..e19da14 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/MapReduceCompiler.java @@ -234,6 +234,7 @@ public void compile(final ParseContext pCtx, final List opRules = new LinkedHashMap(); + opRules.put(new RuleRegExp(new String("R1"), TableScanOperator.getOperatorName() + "%"), new GenMRTableScan1()); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index c34b261..6aa543d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -7810,6 +7810,10 @@ private Operator genTablePlan(String alias, QB qb) throws SemanticException { // Add a mapping from the table scan operator to Table topToTable.put((TableScanOperator) top, tab); + + // set the table in the tablescan descriptor directly + ((TableScanOperator) top).getConf().setTable(tab); + Map props = qb.getTabPropsForAlias(alias); if (props != null) { topToTableProps.put((TableScanOperator) top, props); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/AbstractOperatorDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/AbstractOperatorDesc.java index c096a65..24694ef 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/AbstractOperatorDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/AbstractOperatorDesc.java @@ -21,6 +21,18 @@ public class AbstractOperatorDesc implements OperatorDesc { private boolean vectorMode = false; + protected transient Statistics statistics; + + @Override + @Explain(displayName = "Statistics", normalExplain = false) + public Statistics getStatistics() { + return statistics; + } + + @Override + public void setStatistics(Statistics statistics) { + this.statistics = statistics; + } @Override public Object clone() throws CloneNotSupportedException { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExplainColumnStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExplainColumnStatistics.java new file mode 100644 index 0000000..73eefa2 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExplainColumnStatistics.java @@ -0,0 +1,138 @@ +package org.apache.hadoop.hive.ql.plan; + +public class ExplainColumnStatistics { + private String colName = null; + private String colType = null; + private Long numDVs = null; + private Long numNulls = null; + private Long numTrues = null; + private Long numFalses = null; + private Long avgColLen = 
null; + private Long maxColLen = null; + private Long maxLong = null; + private Long minLong = null; + private Double maxDouble = null; + private Double minDouble = null; + + // @Explain(displayName = "colName") + public String getColName() { + return colName; + } + + public void setColName(String colName) { + this.colName = colName; + } + + // @Explain(displayName = "colType") + public String getColType() { + return colType; + } + + public void setColType(String colType) { + this.colType = colType; + } + + // @Explain(displayName = "numDistincts") + public Long getNumDVs() { + return numDVs; + } + + public void setNumDVs(Long numDVs) { + this.numDVs = numDVs; + } + + // @Explain(displayName = "numNulls") + public Long getNumNulls() { + return numNulls; + } + + public void setNumNulls(Long numNulls) { + this.numNulls = numNulls; + } + + // @Explain(displayName = "numTrues") + public Long getNumTrues() { + return numTrues; + } + + public void setNumTrues(Long numTrues) { + this.numTrues = numTrues; + } + + // @Explain(displayName = "numFalses") + public Long getNumFalses() { + return numFalses; + } + + public void setNumFalses(Long numFalses) { + this.numFalses = numFalses; + } + + // @Explain(displayName = "avgColLen") + public Long getAvgColLen() { + return avgColLen; + } + + public void setAvgColLen(Long avgColLen) { + this.avgColLen = avgColLen; + } + + // @Explain(displayName = "maxColLen") + public Long getMaxColLen() { + return maxColLen; + } + + public void setMaxColLen(Long maxColLen) { + this.maxColLen = maxColLen; + } + + // @Explain(displayName = "maxValue") + public Long getMaxLong() { + return maxLong; + } + + public void setMaxLong(Long maxLong) { + this.maxLong = maxLong; + } + + // @Explain(displayName = "minValue") + public Long getMinLong() { + return minLong; + } + + public void setMinLong(Long minLong) { + this.minLong = minLong; + } + + // @Explain(displayName = "maxValue") + public Double getMaxDouble() { + return maxDouble; + } + + public void setMaxDouble(Double maxDouble) { + this.maxDouble = maxDouble; + } + + // @Explain(displayName = "minValue") + public Double getMinDouble() { + return minDouble; + } + + public void setMinDouble(Double minDouble) { + this.minDouble = minDouble; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + if (colName != null) { + sb.append("colName: "); + sb.append(colName); + if (colType != null) { + sb.append("colType: "); + sb.append(colType); + } + } + return sb.toString(); + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/OperatorDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/OperatorDesc.java index 36757e8..6c2efaf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/OperatorDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/OperatorDesc.java @@ -22,4 +22,6 @@ public interface OperatorDesc extends Serializable, Cloneable { public Object clone() throws CloneNotSupportedException; + public Statistics getStatistics(); + public void setStatistics(Statistics statistics); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java new file mode 100644 index 0000000..9ef052e --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java @@ -0,0 +1,360 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.plan; + +import java.io.Serializable; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + +/** + * Statistics. Describes the output of an operator in terms of size, rows, etc + * based on estimates. + */ +@SuppressWarnings("serial") +public class Statistics implements Serializable { + + // partial stats is not very reliable flag. Even if this flag is set to 'false', + // it may not be reliable. The reason for this is that hive has various ways + // to perform DDL operations. If a table is externally managed or if a partition + // is added by disabling hive.stats.autogather then the stats reported by + // metastore will not be reliable. Reliability of the stats should be taken care + // by hive DDL by setting/maintaining flags in metastore. + public enum State { + NONE, PARTIAL, COMPLETE + } + + public enum Level { + TABLE, PARTITION, BOTH + } + + // table related information + private String dbName; + private String tableName; + private List allColumns; + private List selectedColumns; + + // basic stats + private long numRows; + private long rawDataSize; + + // basic stats state for table/partition + private State basicStatsState; + + // column stats state for table/partition + private State colStatsState; + + // table level or partition level + private Level level; + + private List tableColStats; + private Map> partColStats; + + // output column expression map is required by some operators to find mapping + // between column aliases vs internal column names + private Map> outColExprMap; + + // maintains the mapping between the original table name and its alias + private Map tableAliasMap; + + // AND expression statistics is a clone of original statistics object which is + // used by filter operator. When applying statistics rules for AND expression + // the updated stats object is used cascadingly. So we need a clone of stats + // object whose stats are updated cascadingly by AND operator. All other + // expressions (OR, NOT etc.) apply the rules independent of each other and + // so we can reuse the original stats object. 
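A tiny numeric illustration (hypothetical figures) of why the cascading clone is needed for the andExprStats field declared below: conjuncts are applied to the already-reduced statistics, while disjuncts are each evaluated against the original row count and summed.

    // T(R) = 10000, V(R,a) = 10, V(R,b) = 5   (hypothetical)
    long numRows = 10000L;
    long aEquals = numRows / 10;                 // a = const                     -> 1000
    long aAndB   = aEquals / 5;                  // ... AND b = const, cascaded   ->  200
    long aOrB    = numRows / 10 + numRows / 5;   // a = const OR b = const, summed -> 3000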
+ private Statistics andExprStats; + + public Statistics(String dbName, String tabName, List cols) { + resetStats(); + this.dbName = dbName; + this.tableName = tabName; + this.allColumns = cols; + } + + public Statistics() { + resetStats(); + } + + private void resetStats() { + this.basicStatsState = State.NONE; + this.colStatsState = State.NONE; + this.level = Level.TABLE; + this.numRows = 0; + this.rawDataSize = 0; + this.dbName = null; + this.tableName = null; + this.allColumns = null; + this.selectedColumns = null; + this.tableColStats = null; + this.partColStats = null; + this.outColExprMap = null; + this.andExprStats = null; + this.tableAliasMap = null; + } + + public long getNumRows() { + return numRows; + } + + public void setNumRows(long nr) { + this.numRows = nr; + } + + public void addToNumberOfRows(long nr) { + + if (nr < 0) { + // don't count + return; + } + + this.numRows += nr; + } + + public long getRawDataSize() { + return rawDataSize; + } + + public void setRawDataSize(long ars) { + this.rawDataSize = ars; + } + + public void addToRawDataSize(long rds) { + + if (rds < 0) { + // don't count + return; + } + + this.rawDataSize += rds; + } + + @Override + @Explain(displayName="") + public String toString() { + return "numRows: " + numRows + ", rawDataSize: " + rawDataSize + ", basicStatsState: " + + basicStatsState + ", statsLevel: " + level + ", colStatsState: " + colStatsState + ""; + } + + public String getDbName() { + return dbName; + } + + public void setDbName(String dbName) { + this.dbName = dbName; + } + + public String getTableName() { + return tableName; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + + public List getTableColStats() { + return tableColStats; + } + + public void setTableColStats(List colStats) { + this.tableColStats = colStats; + } + + public List getAllColumns() { + return allColumns; + } + + public void setAllColumns(List columns) { + this.allColumns = columns; + } + + public long getAvgRowSize() { + if (numRows != 0) { + return rawDataSize / numRows; + } + return -1; + } + + public Map> getOutColExprMap() { + return outColExprMap; + } + + public void setOutColExprMap(Map> outColExprMap) { + this.outColExprMap = outColExprMap; + } + + @Override + public Statistics clone() throws CloneNotSupportedException { + List cloneCols = Lists.newArrayList(); + for (String col : this.allColumns) { + cloneCols.add(col); + } + Statistics clone = new Statistics(this.dbName, this.tableName, cloneCols); + clone.numRows = this.numRows; + clone.rawDataSize = this.rawDataSize; + clone.basicStatsState = this.basicStatsState; + clone.colStatsState = this.colStatsState; + clone.level = this.level; + // do we need to clone column statistics as well? 
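+    // The commented-out block below sketches a deep copy of the table and
+    // partition level column statistics via getCloneColumnStatistics(); it is
+    // left disabled until that question is settled.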
+ // clone.tableColStats = getCloneColumnStatistics(this.tableColStats); + // + // Map> clonePCS = Maps.newHashMap(); + // for(Entry> entry : this.partColStats.entrySet()) { + // clonePCS.put(entry.getKey(), getCloneColumnStatistics(entry.getValue())); + // } + // clone.partColStats = clonePCS; + // clone.outColExprMap = null; + return clone; + } + + private List getCloneColumnStatistics(List colStats) { + List cloneTCS = Lists.newArrayList(); + for (ColumnStatistics cs : colStats) { + cloneTCS.add(new ColumnStatistics(cs)); + } + return cloneTCS; + } + + public Statistics getAndExprStats() { + return andExprStats; + } + + public void setAndExprStats(Statistics andExprStats) { + this.andExprStats = andExprStats; + } + + public void addPartitionColStats(String name, List colStats) { + if (partColStats == null) { + partColStats = Maps.newHashMap(); + } + partColStats.put(name, colStats); + } + + public void setPartitionColStats(Map> partColStats) { + this.partColStats = partColStats; + } + + public Map> getPartitionColStats() { + return partColStats; + } + + // TODO: write unit tests for statistics merging + public void merge(Statistics stats) { + if (stats.getDbName() != null && !stats.getDbName().equals(this.dbName)) { + this.dbName += "," + stats.getDbName(); + } + + if (stats.getTableName() != null && !stats.getTableName().equals(this.tableName)) { + this.tableName += "," + stats.getTableName(); + } + + if (stats.getAllColumns() != null && !stats.getAllColumns().equals(this.allColumns)) { + this.allColumns.addAll(stats.getAllColumns()); + } + + if (stats.getSelectedColumns() != null && !stats.getSelectedColumns().equals(this.allColumns)) { + this.allColumns.addAll(stats.getSelectedColumns()); + } + + if (this.basicStatsState.equals(State.NONE) || this.basicStatsState.equals(State.COMPLETE)) { + this.basicStatsState = stats.getBasicStatsState(); + } + + if (this.colStatsState.equals(State.NONE) || this.colStatsState.equals(State.COMPLETE)) { + this.colStatsState = stats.getColStatsState(); + } + + if ((this.level.equals(Level.TABLE) && stats.getLevel().equals(Level.PARTITION)) || + (this.level.equals(Level.PARTITION) && stats.getLevel().equals(Level.TABLE)) || + stats.getLevel().equals(Level.BOTH)) { + this.level = Level.BOTH; + } + + this.addToNumberOfRows(stats.getNumRows()); + this.addToRawDataSize(stats.getRawDataSize()); + + // FIXME: what if both tables have same column names? + if (this.tableColStats == null) { + this.tableColStats = stats.getTableColStats(); + } else { + this.tableColStats.addAll(stats.getTableColStats()); + } + + if (this.partColStats == null) { + this.partColStats = stats.getPartitionColStats(); + } else { + this.partColStats.putAll(stats.getPartitionColStats()); + } + + if (this.andExprStats == null && stats.getAndExprStats() != null) { + this.andExprStats.merge(stats.getAndExprStats()); + } + + // FIXME: how to handle same key names? 
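+    // Current behavior: putAll() below lets entries from the incoming stats
+    // object overwrite any existing mapping that uses the same key.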
+ if (this.outColExprMap == null) { + this.outColExprMap = stats.getOutColExprMap(); + } else { + this.outColExprMap.putAll(stats.getOutColExprMap()); + } + } + + public State getBasicStatsState() { + return basicStatsState; + } + + public void setBasicStatsState(State basicStatsState) { + this.basicStatsState = basicStatsState; + } + + public State getColStatsState() { + return colStatsState; + } + + public void setColStatsState(State colStatsState) { + this.colStatsState = colStatsState; + } + + public Level getLevel() { + return level; + } + + public void setLevel(Level level) { + this.level = level; + } + + public Map getTableAliasMap() { + return tableAliasMap; + } + + public void setTableAliasMap(Map tableAliasMap) { + this.tableAliasMap = tableAliasMap; + } + + public List getSelectedColumns() { + return selectedColumns; + } + + public void setSelectedColumns(List selectedColumns) { + this.selectedColumns = selectedColumns; + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java index ec2f8f2..04f5475 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.Map; +import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; /** @@ -33,6 +34,9 @@ public class TableScanDesc extends AbstractOperatorDesc { private static final long serialVersionUID = 1L; + private transient Table table; + private transient ExprNodeDesc pruningPredicate; + private String alias; private List virtualCols; @@ -75,6 +79,22 @@ public TableScanDesc() { } + public Table getTable() { + return table; + } + + public void setTable(Table t) { + table = t; + } + + public ExprNodeDesc getPruningPredicate() { + return pruningPredicate; + } + + public void setPruningPredicate(ExprNodeDesc expr) { + pruningPredicate = expr; + } + public TableScanDesc(final String alias) { this.alias = alias; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java new file mode 100644 index 0000000..a0dc1d6 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -0,0 +1,747 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.stats; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; +import org.apache.hadoop.hive.ql.parse.PrunedPartitionList; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.Statistics; +import org.apache.hadoop.hive.ql.plan.Statistics.Level; +import org.apache.hadoop.hive.ql.plan.Statistics.State; +import org.apache.hadoop.hive.ql.util.JavaDataModel; +import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantMapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBinaryObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantBinaryObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantStringObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector; +import org.apache.hadoop.io.BytesWritable; + +import com.google.common.collect.Lists; 
+import com.google.common.collect.Maps; + +public class StatsUtils { + + public static Statistics collectStatistics(String alias, + Table table, + HiveConf conf, + TableScanOperator tableScanOperator, + ExprNodeDesc expr) { + + List colNames = Utilities.getColumnNamesFromFieldSchema(table.getAllCols()); + Statistics stats = new Statistics(table.getDbName(), table.getTableName(), colNames); + Map prunedPartitionsMap = new HashMap(); + List neededColumns = tableScanOperator.getNeededColumns(); + stats.setSelectedColumns(neededColumns); + + if (!table.isPartitioned()) { + stats.setLevel(Level.TABLE); + + long nr = getTableStats(conf, table, StatsSetupConst.ROW_COUNT); + long rds = getTableStats(conf, table, StatsSetupConst.RAW_DATA_SIZE); + // if basic stats are not available then return + if (nr <= 0 && rds <= 0) { + stats.setBasicStatsState(State.NONE); + return stats; + } + + // if any basic stats is missing, mark it as partial stats + if (nr <= 0 || rds <= 0) { + stats.setBasicStatsState(State.PARTIAL); + } + + // if both are available then we have complete basic stats + if (nr > 0 && rds > 0) { + stats.setBasicStatsState(State.COMPLETE); + } + stats.setNumRows(nr); + stats.setRawDataSize(rds); + + List colStats = getTableColumnStats(table, neededColumns); + // if column stats available and if atleast one column doesn't have stats + // then mark it as partial + // TODO: check only projected columns? + if (checkIfColStatsAvailable(colStats) && colStats.contains(null)) { + stats.setColStatsState(State.PARTIAL); + } + + // if column stats available and if all columns have stats then mark it + // as complete + if (checkIfColStatsAvailable(colStats) && !colStats.contains(null)) { + stats.setColStatsState(State.COMPLETE); + } + + // set col stats and mark it as table level col stats + stats.setTableColStats(colStats); + } else { + stats.setLevel(Level.PARTITION); + + // For partitioned tables, get the size of all the partitions + PrunedPartitionList partsList; + try { + partsList = PartitionPruner.prune(table, expr, conf, alias, prunedPartitionsMap); + for (Partition part : partsList.getNotDeniedPartns()) { + Statistics newStats = new Statistics(stats.getDbName(), stats.getTableName(), + stats.getAllColumns()); + newStats.setLevel(Level.PARTITION); + long nr = getPartitionStats(conf, part, StatsSetupConst.ROW_COUNT); + long rds = getPartitionStats(conf, part, StatsSetupConst.RAW_DATA_SIZE); + // if both basic stats are not available then mark it as stats not available + if (nr <= 0 && rds <= 0) { + newStats.setBasicStatsState(State.NONE); + } else if (nr <= 0 || rds <= 0) { + newStats.setBasicStatsState(State.PARTIAL); + } else { + newStats.setBasicStatsState(State.COMPLETE); + } + newStats.setNumRows(nr); + newStats.setRawDataSize(rds); + + List colStats = getPartitionColumnStats(table, part, neededColumns); + if (checkIfColStatsAvailable(colStats) && colStats.contains(null)) { + newStats.setColStatsState(State.PARTIAL); + } else if (checkIfColStatsAvailable(colStats) && !colStats.contains(null)) { + newStats.setColStatsState(State.COMPLETE); + } else { + newStats.setColStatsState(State.NONE); + } + + newStats.addPartitionColStats(part.getName(), colStats); + + stats.merge(newStats); + } + } catch (HiveException e) { + stats.setBasicStatsState(State.NONE); + stats.setColStatsState(State.NONE); + } + } + + return stats; + } + + private static List getPartitionColumnStats(Table table, Partition part, List neededColumns) { + List colStats = Lists.newArrayList(); + for (String col : neededColumns) 
{ + ColumnStatistics colStat = null; + try { + colStat = Hive.get().getPartitionColumnStatistics(table.getDbName(), table.getTableName(), + part.getName(), col); + colStats.add(colStat); + } catch (HiveException e) { + // if we cannot get column statistics for specific columns then ignore + // the exception and add nulls to indicate its partial column statistics + colStats.add(null); + } + } + return colStats; + } + + private static boolean checkIfColStatsAvailable(List colStats) { + for (ColumnStatistics cs : colStats) { + if (cs != null) { + return true; + } + } + return false; + } + + private static List + getTableColumnStats(Table table, List neededColumns) { + + List colStatistics = Lists.newArrayList(); + for (String col : neededColumns) { + ColumnStatistics colStats = null; + try { + colStats = Hive.get().getTableColumnStatistics(table.getDbName(), table.getTableName(), + col); + colStatistics.add(colStats); + } catch (HiveException e) { + // if we cannot get column statistics for specific columns then ignore + // the exception and add nulls to indicate its partial column statistics + colStatistics.add(null); + } + } + + return colStatistics; + } + + private static long getPartitionStats(HiveConf conf, Partition part, String type) { + Path path = part.getPartitionPath(); + + if (StatsSetupConst.ROW_COUNT.equals(type)) { + return getNumRows(part.getParameters().get(type), path); + } else if (StatsSetupConst.RAW_DATA_SIZE.equals(type)) { + return getRawDataSize(conf, part, path); + } + + return 0; + } + + private static long getTableStats(HiveConf conf, Table table, String type) { + Path path = table.getPath(); + + if (StatsSetupConst.ROW_COUNT.equals(type)) { + return getNumRows(table.getProperty(type), path); + } else if (StatsSetupConst.RAW_DATA_SIZE.equals(type)) { + return getRawDataSize(conf, table, path); + } + + return 0; + } + + private static long getRawDataSize(HiveConf conf, Object object, Path path) { + // get raw data size (uncompressed size) from table params. If raw data size is + // not available then get total file size from table params. If total file size + // is also not available then get content summary of the file and read the + // file length. 
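+    //
+    // For example, a table whose parameters carry rawDataSize 0 but
+    // totalSize 300 reports 300; if neither parameter yields a usable value,
+    // the length from FileSystem.getContentSummary(path) is used, and any
+    // failure in that last step falls back to 0.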
+ + // check for raw data size + long size = 0; + Table table = null; + Partition part = null; + String rds = null; + String ts = null; + if (object instanceof Table) { + table = (Table) object; + rds = table.getProperty(StatsSetupConst.RAW_DATA_SIZE); + ts = table.getProperty(StatsSetupConst.TOTAL_SIZE); + } + + if (object instanceof Partition) { + part = (Partition) object; + rds = part.getParameters().get(StatsSetupConst.RAW_DATA_SIZE); + ts = part.getParameters().get(StatsSetupConst.TOTAL_SIZE); + } + + if (rds != null) { + try { + size = Long.valueOf(rds); + } catch (NumberFormatException e) { + size = 0; + } + } + + // check for total file size + if (size == 0 && ts != null) { + try { + size = Long.valueOf(ts); + } catch (NumberFormatException e) { + size = 0; + } + } + + if (size == 0) { + try { + FileSystem fs = path.getFileSystem(conf); + size = fs.getContentSummary(path).getLength(); + } catch (Exception e) { + size = 0; + } + } + return size; + } + + private static long getNumRows(String nr, Path path) { + // If the size is present in the metastore, use it + if (nr != null) { + try { + return Long.valueOf(nr); + } catch (NumberFormatException e) { + return 0; + } + } + + return 0; + } + + public static List getColumnStatsForSelectedColumns(Statistics stats, + boolean[] colSelections) throws HiveException { + List result = Lists.newArrayList(); + int idx = 0; + for (boolean selected : colSelections) { + if (selected) { + result.add(stats.getTableColStats().get(idx)); + } + idx++; + } + return result; + } + + public static long getRawDataSizeForSelectedColumns(Statistics stats, + List selColStats) { + long result = 0; + for (ColumnStatistics cs : selColStats) { + for (ColumnStatisticsObj cso : cs.getStatsObj()) { + ColumnStatisticsData csd = cso.getStatsData(); + String colType = cso.getColType(); + + if (colType.equalsIgnoreCase("string")) { + if (csd.isSetStringStats()) { + result += (stats.getNumRows() - csd.getStringStats().getNumNulls()) + * csd.getStringStats().getAvgColLen(); + } else { + // TODO: assume worst case? + } + } else if (colType.equalsIgnoreCase("binary")) { + if (csd.isSetBinaryStats()) { + result += (stats.getNumRows() - csd.getBinaryStats().getNumNulls()) + * csd.getBinaryStats().getAvgColLen(); + } else { + // TODO: assume worst case? 
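+            // As in the string case above, no estimate is added when binary
+            // column statistics are missing.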
+ } + } else if (colType.equalsIgnoreCase("tinyint") || colType.equalsIgnoreCase("smallint") + || colType.equalsIgnoreCase("int") || colType.equalsIgnoreCase("bigint")) { + int typeSize = 0; + if (colType.equalsIgnoreCase("bigint")) { + typeSize = JavaDataModel.get().primitive2(); + } else { + typeSize = JavaDataModel.get().primitive1(); + } + if (csd.isSetLongStats()) { + result += (stats.getNumRows() - csd.getLongStats().getNumNulls()) + * typeSize; + } else { + result += stats.getNumRows() * typeSize; + } + } else if (colType.equalsIgnoreCase("boolean")) { + result += stats.getNumRows() * JavaDataModel.get().primitive1(); + } else if (colType.equalsIgnoreCase("timestamp")) { + result += stats.getNumRows() * JavaDataModel.get().lengthOfTimestamp(); + } else if (colType.equalsIgnoreCase("decimal")) { + result += stats.getNumRows() * JavaDataModel.get().lengthOfDecimal(); + } else if (colType.equalsIgnoreCase("date")) { + result += stats.getNumRows() * JavaDataModel.get().lengthOfDate(); + } else if (colType.equalsIgnoreCase("float") || colType.equalsIgnoreCase("double")) { + int typeSize = 0; + if (colType.equalsIgnoreCase("double")) { + typeSize = JavaDataModel.get().primitive2(); + } else { + typeSize = JavaDataModel.get().primitive1(); + } + if (csd.isSetDoubleStats()) { + result += (stats.getNumRows() - csd.getDoubleStats().getNumNulls()) * typeSize; + } else { + result += stats.getNumRows() * typeSize; + } + } else if (colType.equalsIgnoreCase("struct")) { + // TODO: + } else if (colType.equalsIgnoreCase("list")) { + // TODO: + } else if (colType.equalsIgnoreCase("map")) { + // TODO: + } else if (colType.equalsIgnoreCase("union")) { + // TODO: + } else { + // TODO: + } + } + } + return result; + } + + public static long getSizeOfVariableLengthTypes(ObjectInspector oi, String colType) { + if (colType.equalsIgnoreCase("string")) { + if (oi instanceof ConstantObjectInspector) { + ConstantObjectInspector coi = (ConstantObjectInspector) oi; + if (coi.getWritableConstantValue() == null) { + return 0; + } + return coi.getWritableConstantValue().toString().length(); + } else if (oi instanceof WritableConstantStringObjectInspector) { + WritableConstantStringObjectInspector wcsoi = (WritableConstantStringObjectInspector) oi; + return wcsoi.getWritableConstantValue().toString().length(); + } else if (oi instanceof WritableStringObjectInspector) { + // FIXME: we don't support writable objects. we support only constants + // in projection. Output of UDFs should also accounted for raw data size + return 0; + } + } else if (colType.equalsIgnoreCase("binary")) { + if (oi instanceof ConstantObjectInspector) { + ConstantObjectInspector coi = (ConstantObjectInspector) oi; + if (coi.getWritableConstantValue() == null) { + return 0; + } + BytesWritable bw = ((BytesWritable) coi.getWritableConstantValue()); + return bw.getLength(); + } else if (oi instanceof WritableConstantBinaryObjectInspector) { + WritableConstantBinaryObjectInspector wcboi = (WritableConstantBinaryObjectInspector) oi; + return wcboi.getWritableConstantValue().getLength(); + } else if (oi instanceof WritableBinaryObjectInspector) { + // FIXME: we don't support writable objects. we support only constants + // in projection. 
Output of UDFs should also accounted for raw data size + return 0; + } + } else { + // complex types (map, list, struct, union) + return getSizeOfComplexTypes(oi); + } + + return 0; + } + + private static long getSizeOfComplexTypes(ObjectInspector oi) { + long result = 0; + int length = 0; + switch (oi.getCategory()) { + case PRIMITIVE: + String colType = oi.getTypeName(); + if (colType.equalsIgnoreCase("string") || colType.equalsIgnoreCase("binary")) { + result += getSizeOfVariableLengthTypes(oi, colType); + } else { + result += getSizeOfFixedLengthPrimitivesFromType(colType); + } + break; + case LIST: + if (oi instanceof StandardConstantListObjectInspector) { + StandardConstantListObjectInspector scloi = (StandardConstantListObjectInspector) oi; + length = scloi.getWritableConstantValue().size(); + result += length * getSizeOfComplexTypes(scloi.getListElementObjectInspector()); + } else { + StandardListObjectInspector sloi = (StandardListObjectInspector) oi; + result += getSizeOfComplexTypes(sloi.getListElementObjectInspector()); + } + break; + case MAP: + if (oi instanceof StandardConstantMapObjectInspector) { + StandardConstantMapObjectInspector scmoi = (StandardConstantMapObjectInspector) oi; + result += getSizeOfMap(scmoi); + } else { + StandardMapObjectInspector smoi = (StandardMapObjectInspector) oi; + result += getSizeOfComplexTypes(smoi.getMapKeyObjectInspector()); + result += getSizeOfComplexTypes(smoi.getMapValueObjectInspector()); + } + break; + case STRUCT: + StructObjectInspector soi = (StructObjectInspector) oi; + for (StructField field : soi.getAllStructFieldRefs()) { + result += getSizeOfComplexTypes(field.getFieldObjectInspector()); + } + break; + case UNION: + UnionObjectInspector uoi = (UnionObjectInspector) oi; + for (ObjectInspector foi : uoi.getObjectInspectors()) { + result += getSizeOfComplexTypes(foi); + } + break; + default: + break; + } + return result; + } + + public static long getSizeOfFixedLengthPrimitivesFromType(String colType) { + if (colType.equalsIgnoreCase("tinyint") || + colType.equalsIgnoreCase("smallint") || + colType.equalsIgnoreCase("int") || + colType.equalsIgnoreCase("boolean") || + colType.equalsIgnoreCase("byte") || + colType.equalsIgnoreCase("float")) { + return JavaDataModel.get().primitive1(); + } else if (colType.equalsIgnoreCase("double") || + colType.equalsIgnoreCase("bigint")) { + return JavaDataModel.get().primitive2(); + } else { + return 0; + } + } + + private static long getSizeOfMap(StandardConstantMapObjectInspector scmoi) { + Map map = scmoi.getWritableConstantValue(); + ObjectInspector koi = scmoi.getMapKeyObjectInspector(); + ObjectInspector voi = scmoi.getMapValueObjectInspector(); + long result = 0; + for (Map.Entry entry : map.entrySet()) { + result += getWritableSize(koi, entry.getKey()); + result += getWritableSize(voi, entry.getValue()); + } + return result; + } + + public static long getWritableSize(ObjectInspector oi, Object value) { + if (oi instanceof WritableStringObjectInspector) { + WritableStringObjectInspector woi = (WritableStringObjectInspector) oi; + return woi.getPrimitiveWritableObject(value).getLength(); + } else if (oi instanceof WritableBinaryObjectInspector) { + WritableBinaryObjectInspector woi = (WritableBinaryObjectInspector) oi; + return woi.getPrimitiveWritableObject(value).getLength(); + } else if (oi instanceof WritableBooleanObjectInspector) { + return JavaDataModel.get().primitive1(); + } else if (oi instanceof WritableByteObjectInspector) { + return JavaDataModel.get().primitive1(); + } else 
if (oi instanceof WritableDateObjectInspector) { + return JavaDataModel.get().lengthOfDate(); + } else if (oi instanceof WritableDoubleObjectInspector) { + return JavaDataModel.get().primitive2(); + } else if (oi instanceof WritableFloatObjectInspector) { + return JavaDataModel.get().primitive1(); + } else if (oi instanceof WritableHiveDecimalObjectInspector) { + return JavaDataModel.get().lengthOfDecimal(); + } else if (oi instanceof WritableIntObjectInspector) { + return JavaDataModel.get().primitive1(); + } else if (oi instanceof WritableLongObjectInspector) { + return JavaDataModel.get().primitive2(); + } else if (oi instanceof WritableShortObjectInspector) { + return JavaDataModel.get().primitive1(); + } else if (oi instanceof WritableTimestampObjectInspector) { + return JavaDataModel.get().lengthOfTimestamp(); + } + + return 0; + } + + public static void updateStats(Statistics stats, long newNumRows) { + long avgRowSize = stats.getAvgRowSize(); + stats.setNumRows(newNumRows); + stats.setRawDataSize(newNumRows * avgRowSize); + } + + public static long getDistinctCountOfColumnFromTable(Statistics stats, String dbName, + String tabName, String colName) { + for (ColumnStatistics cs : stats.getTableColStats()) { + // skip the columns for which we don't have statistics + + // check for valid db and table names + if (cs != null && cs.getStatsDesc().getDbName().equalsIgnoreCase(dbName) && + cs.getStatsDesc().getTableName().equalsIgnoreCase(tabName)) { + for (ColumnStatisticsObj cso : cs.getStatsObj()) { + if (cso.getColName().equalsIgnoreCase(colName)) { + String colType = cso.getColType(); + if (colType.equalsIgnoreCase("tinyint") || + colType.equalsIgnoreCase("smallint") || + colType.equalsIgnoreCase("int") || + colType.equalsIgnoreCase("bigint")) { + return cso.getStatsData().getLongStats().getNumDVs(); + } else if (colType.equalsIgnoreCase("double") || + colType.equalsIgnoreCase("float")) { + return cso.getStatsData().getDoubleStats().getNumDVs(); + } else if (colType.equalsIgnoreCase("string")) { + return cso.getStatsData().getStringStats().getNumDVs(); + } else if (colType.equalsIgnoreCase("boolean")) { + return 2; + } + } + } + } + } + return 0; + } + + public static long getDistinctCountOfColumnFromPartition(Statistics stats, String dbName, + String partName, String colName) { + for (ColumnStatistics cs : stats.getTableColStats()) { + // skip the columns for which we don't have statistics + + // check for valid db and partition names + if (cs != null && cs.getStatsDesc().getDbName().equalsIgnoreCase(dbName) && + cs.getStatsDesc().getPartName().equalsIgnoreCase(partName)) { + for (ColumnStatisticsObj cso : cs.getStatsObj()) { + if (cso.getColName().equalsIgnoreCase(colName)) { + String colType = cso.getColType(); + if (colType.equalsIgnoreCase("tinyint") || + colType.equalsIgnoreCase("smallint") || + colType.equalsIgnoreCase("int") || + colType.equalsIgnoreCase("bigint")) { + return cso.getStatsData().getLongStats().getNumDVs(); + } else if (colType.equalsIgnoreCase("double") || + colType.equalsIgnoreCase("float")) { + return cso.getStatsData().getDoubleStats().getNumDVs(); + } else if (colType.equalsIgnoreCase("string")) { + return cso.getStatsData().getStringStats().getNumDVs(); + } else if (colType.equalsIgnoreCase("boolean")) { + return 2; + } + } + } + } + } + return 0; + } + + public static void getExtToIntColMappings(Operator curOp, + Map> outColExprMap) { + if (curOp == null) { + return; + } + + for (Operator op : curOp.getParentOperators()) { + if (op instanceof 
TableScanOperator) { + TableScanOperator tsop = (TableScanOperator) op; + String tabName = tsop.getConf().getTable().getTableName(); + + // Special case: no projection on map side. If there is projection in + // map side then column mapping will be done in reduce sink. + if (!checkForMapSideSelectOp(tsop)) { + ReduceSinkOperator rso = getReduceSinkOperator(tsop); + // if we couldn't find RS then it might be FS and map-only job + // in which we don't need column mapping anymore + if (rso == null) { + return; + } + Map colExpMap = Maps.newHashMap(); + ArrayList outValColNames = rso.getConf().getOutputValueColumnNames(); + ArrayList exprNode = rso.getConf().getValueCols(); + for (int i = 0; i < outValColNames.size(); i++) { + colExpMap.put(outValColNames.get(i), exprNode.get(i)); + } + outColExprMap.put(tabName, colExpMap); + } else { + // TS operator should have SEL as its children. The output expression + // map will contain mapping between table column aliases and internal + // names. We need this information to handle the case when projection + // uses alias renaming + Map outColMap = Maps.newHashMap(); + getInternalColumnMapping(tsop, outColMap); + outColExprMap.put(tabName, outColMap); + } + } else { + getExtToIntColMappings(op, outColExprMap); + } + } + } + + private static ReduceSinkOperator getReduceSinkOperator(TableScanOperator tsop) { + Operator op = tsop; + while (op.getChildOperators() != null) { + for (Operator child : op.getChildOperators()) { + // we have hit RS + if (child instanceof ReduceSinkOperator) { + return (ReduceSinkOperator) child; + } + op = child; + } + } + return null; + } + + private static boolean checkForMapSideSelectOp(TableScanOperator tsop) { + Operator op = tsop; + while (op.getChildOperators() != null) { + for (Operator child : op.getChildOperators()) { + // we have hit RS before select so return there is not map side SEL + if (child instanceof ReduceSinkOperator) { + return false; + } + + if (child instanceof SelectOperator) { + return true; + } + op = child; + } + } + return false; + } + + private static void getInternalColumnMapping(Operator curOp, + Map colMapping) { + if (curOp == null) { + return; + } + for (Operator child : curOp.getChildOperators()) { + Map result = child.getColumnExprMap(); + if (result != null) { + for (String key : result.keySet()) { + if (key.startsWith("_col")) { + // we found the internal column mappings + colMapping.putAll(result); + } + } + } else { + getInternalColumnMapping(child, colMapping); + } + } + } + + public static Map> getOuputColumnExprMap( + Operator op, + Statistics stats) { + Map> outColExprMap = stats.getOutColExprMap(); + if (outColExprMap == null) { + outColExprMap = Maps.newHashMap(); + getExtToIntColMappings(op, outColExprMap); + stats.setOutColExprMap(outColExprMap); + } + return outColExprMap; + } + + public static Map getTableAliasMap(Operator op, + Statistics stats) { + Map aliasMap = stats.getTableAliasMap(); + if (aliasMap == null) { + aliasMap = Maps.newHashMap(); + getAliasToTableNameMapping(op, aliasMap); + } + return aliasMap; + } + + private static void getAliasToTableNameMapping(Operator curOp, + Map aliasMap) { + if (curOp == null) { + return; + } + + for (Operator parent : curOp.getParentOperators()) { + if (parent instanceof TableScanOperator) { + TableScanOperator tsop = (TableScanOperator) parent; + String alias = tsop.getConf().getAlias(); + String tabName = tsop.getConf().getTable().getTableName(); + aliasMap.put(alias, tabName); + } else { + getAliasToTableNameMapping(parent, 
aliasMap); + } + } + } +} diff --git a/ql/src/test/queries/clientpositive/annotate_stats_1.q b/ql/src/test/queries/clientpositive/annotate_stats_1.q new file mode 100644 index 0000000..a4930c0 --- /dev/null +++ b/ql/src/test/queries/clientpositive/annotate_stats_1.q @@ -0,0 +1,39 @@ +create table if not exists emp_staging ( + lastname string, + deptid int +) row format delimited fields terminated by '|' stored as textfile; + +create table if not exists emp_orc like emp_staging; +alter table emp_orc set fileformat orc; + +LOAD DATA LOCAL INPATH '../data/files/emp.txt' OVERWRITE INTO TABLE emp_staging; + +set hive.stats.autogather=false; +set hive.exec.dynamic.partition=true; +set hive.exec.dynamic.partition.mode=nonstrict; + +insert overwrite table emp_orc select * from emp_staging; + +-- stats are disabled. basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL + +-- basicStatState: PARTIAL level: TABLE colStatState: NONE +explain extended select * from emp_orc; + +-- table level analyze statistics +analyze table emp_orc compute statistics; + +-- basicStatState: COMPLETE level: TABLE colStatState: NONE +explain extended select * from emp_orc; + +-- column level partial statistics +analyze table emp_orc compute statistics for columns deptid; +-- basicStatState: COMPLETE level: TABLE colStatState: PARTIAL +explain extended select * from emp_orc; +-- all selected columns have statistics +-- basicStatState: COMPLETE level: TABLE colStatState: COMPLETE +explain extended select deptid from emp_orc; + +-- column level complete statistics +analyze table emp_orc compute statistics for columns lastname,deptid; +-- basicStatState: COMPLETE level: TABLE colStatState: COMPLETE +explain extended select * from emp_orc; diff --git a/ql/src/test/queries/clientpositive/annotate_stats_2.q b/ql/src/test/queries/clientpositive/annotate_stats_2.q new file mode 100644 index 0000000..bb263d8 --- /dev/null +++ b/ql/src/test/queries/clientpositive/annotate_stats_2.q @@ -0,0 +1,69 @@ +create table if not exists loc_staging ( + state string, + locid int, + zip bigint, + year int +) row format delimited fields terminated by '|' stored as textfile; + +LOAD DATA LOCAL INPATH '../data/files/loc.txt' OVERWRITE INTO TABLE loc_staging; + +create table if not exists loc_orc ( + state string, + locid int, + zip bigint +) partitioned by(year int) stored as orc; + +set hive.stats.autogather=false; +set hive.exec.dynamic.partition=true; +set hive.exec.dynamic.partition.mode=nonstrict; + +insert overwrite table loc_orc partition(year) select * from loc_staging; + +-- stats are disabled. basic stats will report the file size but not raw data size. 
so initial statistics will be PARTIAL + +-- basicStatState: PARTIAL level: PARTITION colStatState: NONE +explain extended select * from loc_orc; + +-- partition level analyze statistics for specific parition +analyze table loc_orc partition(year=2001) compute statistics; +-- basicStatState: PARTIAL level: PARTITION colStatState: NONE +explain extended select * from loc_orc where year='__HIVE_DEFAULT_PARTITION__'; +-- basicStatState: PARTIAL level: PARTITION colStatState: NONE +explain extended select * from loc_orc; +-- basicStatState: COMPLETE level: PARTITION colStatState: NONE +explain extended select * from loc_orc where year=2001; + +-- partition level analyze statistics for all partitions +analyze table loc_orc partition(year) compute statistics; +-- basicStatState: COMPLETE level: PARTITION colStatState: NONE +explain extended select * from loc_orc where year='__HIVE_DEFAULT_PARTITION__'; +-- basicStatState: COMPLETE level: PARTITION colStatState: NONE +explain extended select * from loc_orc; +-- basicStatState: COMPLETE level: PARTITION colStatState: NONE +explain extended select * from loc_orc where year=2001 or year='__HIVE_DEFAULT_PARTITION__'; +-- both partitions will be pruned +-- basicStatState: NONE level: PARTITION colStatState: NONE +explain extended select * from loc_orc where year=2001 and year='__HIVE_DEFAULT_PARTITION__'; + +-- partition level partial column statistics +analyze table loc_orc partition(year=2001) compute statistics for columns state,locid; +-- basicStatState: COMPLETE level: PARTITION colStatState: NONE +explain extended select zip from loc_orc; +-- basicStatState: COMPLETE level: PARTITION colStatState: PARTIAL +explain extended select state from loc_orc; +-- basicStatState: COMPLETE level: PARTITION colStatState: COMPLETE +explain extended select state,locid from loc_orc; +-- basicStatState: COMPLETE level: PARTITION colStatState: PARTIAL +explain extended select * from loc_orc; + +-- partition level column column statistics +analyze table loc_orc partition(year=2001) compute statistics for columns state,locid,zip; +-- partition column +-- basicStatState: PARTIAL level: PARTITION colStatState: NONE +explain extended select year from loc_orc; +-- stats for partition column missing +-- basicStatState: PARTIAL level: PARTITION colStatState: PARTIAL +explain extended select * from loc_orc; +-- stats for all columns are available +-- basicStatState: PARTIAL level: PARTITION colStatState: COMPLETE +explain extended select state,locid,zip from loc_orc; diff --git a/ql/src/test/queries/clientpositive/annotate_stats_3.q b/ql/src/test/queries/clientpositive/annotate_stats_3.q new file mode 100644 index 0000000..fd452f5 --- /dev/null +++ b/ql/src/test/queries/clientpositive/annotate_stats_3.q @@ -0,0 +1,34 @@ +create table if not exists emp ( + lastname string, + deptid int +) row format delimited fields terminated by '|' stored as textfile; + +create table if not exists dept ( + deptid int, + deptname string +) row format delimited fields terminated by '|' stored as textfile; + +create table if not exists loc ( + state string, + locid int, + zip bigint +) row format delimited fields terminated by '|' stored as textfile; + +LOAD DATA LOCAL INPATH '../data/files/emp.txt' OVERWRITE INTO TABLE emp; +LOAD DATA LOCAL INPATH '../data/files/dept.txt' OVERWRITE INTO TABLE dept; +LOAD DATA LOCAL INPATH '../data/files/loc.txt' OVERWRITE INTO TABLE loc; + +-- explain extended should show statistics for these tables +explain extended select * from emp; + +-- column 
projections +explain extended select lastname from emp; +explain extended select deptid from emp; + +-- constant projection +explain extended select 11 from emp; + +-- UDF projection +explain extended select sum(deptid) from emp; +explain extended select count(*) from emp; + diff --git a/ql/src/test/results/clientpositive/annotate_stats_1.q.out b/ql/src/test/results/clientpositive/annotate_stats_1.q.out new file mode 100644 index 0000000..d0c2f13 --- /dev/null +++ b/ql/src/test/results/clientpositive/annotate_stats_1.q.out @@ -0,0 +1,335 @@ +PREHOOK: query: create table if not exists emp_staging ( + lastname string, + deptid int +) row format delimited fields terminated by '|' stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table if not exists emp_staging ( + lastname string, + deptid int +) row format delimited fields terminated by '|' stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@emp_staging +PREHOOK: query: create table if not exists emp_orc like emp_staging +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table if not exists emp_orc like emp_staging +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@emp_orc +PREHOOK: query: alter table emp_orc set fileformat orc +PREHOOK: type: ALTERTABLE_FILEFORMAT +PREHOOK: Input: default@emp_orc +PREHOOK: Output: default@emp_orc +POSTHOOK: query: alter table emp_orc set fileformat orc +POSTHOOK: type: ALTERTABLE_FILEFORMAT +POSTHOOK: Input: default@emp_orc +POSTHOOK: Output: default@emp_orc +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/emp.txt' OVERWRITE INTO TABLE emp_staging +PREHOOK: type: LOAD +PREHOOK: Output: default@emp_staging +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/emp.txt' OVERWRITE INTO TABLE emp_staging +POSTHOOK: type: LOAD +POSTHOOK: Output: default@emp_staging +PREHOOK: query: insert overwrite table emp_orc select * from emp_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@emp_staging +PREHOOK: Output: default@emp_orc +POSTHOOK: query: insert overwrite table emp_orc select * from emp_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@emp_staging +POSTHOOK: Output: default@emp_orc +POSTHOOK: Lineage: emp_orc.deptid SIMPLE [(emp_staging)emp_staging.FieldSchema(name:deptid, type:int, comment:null), ] +POSTHOOK: Lineage: emp_orc.lastname SIMPLE [(emp_staging)emp_staging.FieldSchema(name:lastname, type:string, comment:null), ] +PREHOOK: query: -- stats are disabled. basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL + +-- basicStatState: PARTIAL level: TABLE colStatState: NONE +explain extended select * from emp_orc +PREHOOK: type: QUERY +POSTHOOK: query: -- stats are disabled. basic stats will report the file size but not raw data size. 
so initial statistics will be PARTIAL + +-- basicStatState: PARTIAL level: TABLE colStatState: NONE +explain extended select * from emp_orc +POSTHOOK: type: QUERY +POSTHOOK: Lineage: emp_orc.deptid SIMPLE [(emp_staging)emp_staging.FieldSchema(name:deptid, type:int, comment:null), ] +POSTHOOK: Lineage: emp_orc.lastname SIMPLE [(emp_staging)emp_staging.FieldSchema(name:lastname, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME emp_orc))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: emp_orc + Statistics: + numRows: 0, rawDataSize: 300, basicStatsState: PARTIAL, statsLevel: TABLE, colStatsState: NONE + GatherStats: false + Select Operator + expressions: + expr: lastname + type: string + expr: deptid + type: int + outputColumnNames: _col0, _col1 + Statistics: + numRows: 0, rawDataSize: 300, basicStatsState: PARTIAL, statsLevel: TABLE, colStatsState: NONE + ListSink + + +PREHOOK: query: -- table level analyze statistics +analyze table emp_orc compute statistics +PREHOOK: type: QUERY +PREHOOK: Input: default@emp_orc +PREHOOK: Output: default@emp_orc +POSTHOOK: query: -- table level analyze statistics +analyze table emp_orc compute statistics +POSTHOOK: type: QUERY +POSTHOOK: Input: default@emp_orc +POSTHOOK: Output: default@emp_orc +POSTHOOK: Lineage: emp_orc.deptid SIMPLE [(emp_staging)emp_staging.FieldSchema(name:deptid, type:int, comment:null), ] +POSTHOOK: Lineage: emp_orc.lastname SIMPLE [(emp_staging)emp_staging.FieldSchema(name:lastname, type:string, comment:null), ] +PREHOOK: query: -- basicStatState: COMPLETE level: TABLE colStatState: NONE +explain extended select * from emp_orc +PREHOOK: type: QUERY +POSTHOOK: query: -- basicStatState: COMPLETE level: TABLE colStatState: NONE +explain extended select * from emp_orc +POSTHOOK: type: QUERY +POSTHOOK: Lineage: emp_orc.deptid SIMPLE [(emp_staging)emp_staging.FieldSchema(name:deptid, type:int, comment:null), ] +POSTHOOK: Lineage: emp_orc.lastname SIMPLE [(emp_staging)emp_staging.FieldSchema(name:lastname, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME emp_orc))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: emp_orc + Statistics: + numRows: 6, rawDataSize: 300, basicStatsState: COMPLETE, statsLevel: TABLE, colStatsState: NONE + GatherStats: false + Select Operator + expressions: + expr: lastname + type: string + expr: deptid + type: int + outputColumnNames: _col0, _col1 + Statistics: + numRows: 6, rawDataSize: 300, basicStatsState: COMPLETE, statsLevel: TABLE, colStatsState: NONE + ListSink + + +PREHOOK: query: -- column level partial statistics +analyze table emp_orc compute statistics for columns deptid +PREHOOK: type: QUERY +PREHOOK: Input: default@emp_orc +#### A masked pattern was here #### +POSTHOOK: query: -- column level partial statistics +analyze table emp_orc compute statistics for columns deptid +POSTHOOK: type: QUERY +POSTHOOK: Input: default@emp_orc +#### A masked pattern was here #### +POSTHOOK: Lineage: emp_orc.deptid SIMPLE [(emp_staging)emp_staging.FieldSchema(name:deptid, type:int, comment:null), ] +POSTHOOK: Lineage: 
emp_orc.lastname SIMPLE [(emp_staging)emp_staging.FieldSchema(name:lastname, type:string, comment:null), ] +PREHOOK: query: -- basicStatState: COMPLETE level: TABLE colStatState: PARTIAL +explain extended select * from emp_orc +PREHOOK: type: QUERY +POSTHOOK: query: -- basicStatState: COMPLETE level: TABLE colStatState: PARTIAL +explain extended select * from emp_orc +POSTHOOK: type: QUERY +POSTHOOK: Lineage: emp_orc.deptid SIMPLE [(emp_staging)emp_staging.FieldSchema(name:deptid, type:int, comment:null), ] +POSTHOOK: Lineage: emp_orc.lastname SIMPLE [(emp_staging)emp_staging.FieldSchema(name:lastname, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME emp_orc))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: emp_orc + Statistics: + numRows: 6, rawDataSize: 300, basicStatsState: COMPLETE, statsLevel: TABLE, colStatsState: PARTIAL + GatherStats: false + Select Operator + expressions: + expr: lastname + type: string + expr: deptid + type: int + outputColumnNames: _col0, _col1 + Statistics: + numRows: 6, rawDataSize: 300, basicStatsState: COMPLETE, statsLevel: TABLE, colStatsState: PARTIAL + ListSink + + +PREHOOK: query: -- all selected columns have statistics +-- basicStatState: COMPLETE level: TABLE colStatState: COMPLETE +explain extended select deptid from emp_orc +PREHOOK: type: QUERY +POSTHOOK: query: -- all selected columns have statistics +-- basicStatState: COMPLETE level: TABLE colStatState: COMPLETE +explain extended select deptid from emp_orc +POSTHOOK: type: QUERY +POSTHOOK: Lineage: emp_orc.deptid SIMPLE [(emp_staging)emp_staging.FieldSchema(name:deptid, type:int, comment:null), ] +POSTHOOK: Lineage: emp_orc.lastname SIMPLE [(emp_staging)emp_staging.FieldSchema(name:lastname, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME emp_orc))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL deptid))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + emp_orc + TableScan + alias: emp_orc + Statistics: + numRows: 6, rawDataSize: 300, basicStatsState: COMPLETE, statsLevel: TABLE, colStatsState: COMPLETE + GatherStats: false + Select Operator + expressions: + expr: deptid + type: int + outputColumnNames: _col0 + Statistics: + numRows: 6, rawDataSize: 20, basicStatsState: COMPLETE, statsLevel: TABLE, colStatsState: COMPLETE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: + numRows: 6, rawDataSize: 20, basicStatsState: COMPLETE, statsLevel: TABLE, colStatsState: COMPLETE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + columns _col0 + columns.types int + escape.delim \ + hive.serialization.extend.nesting.levels true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked 
pattern was here #### + Partition + base file name: emp_orc + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns lastname,deptid + columns.types string:int + field.delim | +#### A masked pattern was here #### + name default.emp_orc + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 0 + serialization.ddl struct emp_orc { string lastname, i32 deptid} + serialization.format | + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 300 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns lastname,deptid + columns.types string:int + field.delim | +#### A masked pattern was here #### + name default.emp_orc + numFiles 1 + numPartitions 0 + numRows 6 + rawDataSize 0 + serialization.ddl struct emp_orc { string lastname, i32 deptid} + serialization.format | + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 300 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.emp_orc + name: default.emp_orc + Truncated Path -> Alias: + /emp_orc [emp_orc] + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: -- column level complete statistics +analyze table emp_orc compute statistics for columns lastname,deptid +PREHOOK: type: QUERY +PREHOOK: Input: default@emp_orc +#### A masked pattern was here #### +POSTHOOK: query: -- column level complete statistics +analyze table emp_orc compute statistics for columns lastname,deptid +POSTHOOK: type: QUERY +POSTHOOK: Input: default@emp_orc +#### A masked pattern was here #### +POSTHOOK: Lineage: emp_orc.deptid SIMPLE [(emp_staging)emp_staging.FieldSchema(name:deptid, type:int, comment:null), ] +POSTHOOK: Lineage: emp_orc.lastname SIMPLE [(emp_staging)emp_staging.FieldSchema(name:lastname, type:string, comment:null), ] +PREHOOK: query: -- basicStatState: COMPLETE level: TABLE colStatState: COMPLETE +explain extended select * from emp_orc +PREHOOK: type: QUERY +POSTHOOK: query: -- basicStatState: COMPLETE level: TABLE colStatState: COMPLETE +explain extended select * from emp_orc +POSTHOOK: type: QUERY +POSTHOOK: Lineage: emp_orc.deptid SIMPLE [(emp_staging)emp_staging.FieldSchema(name:deptid, type:int, comment:null), ] +POSTHOOK: Lineage: emp_orc.lastname SIMPLE [(emp_staging)emp_staging.FieldSchema(name:lastname, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME emp_orc))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: emp_orc + Statistics: + numRows: 6, rawDataSize: 300, basicStatsState: COMPLETE, statsLevel: TABLE, colStatsState: COMPLETE + GatherStats: false + Select Operator + expressions: + expr: lastname + type: string + expr: deptid + type: int + outputColumnNames: _col0, _col1 + Statistics: + numRows: 6, rawDataSize: 300, basicStatsState: COMPLETE, statsLevel: TABLE, colStatsState: COMPLETE + ListSink + + diff --git a/ql/src/test/results/clientpositive/annotate_stats_2.q.out b/ql/src/test/results/clientpositive/annotate_stats_2.q.out new file mode 100644 index 0000000..9a82dbc --- 
/dev/null +++ b/ql/src/test/results/clientpositive/annotate_stats_2.q.out @@ -0,0 +1,485 @@ +PREHOOK: query: create table if not exists loc_staging ( + state string, + locid int, + zip bigint, + year int +) row format delimited fields terminated by '|' stored as textfile +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table if not exists loc_staging ( + state string, + locid int, + zip bigint, + year int +) row format delimited fields terminated by '|' stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@loc_staging +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/loc.txt' OVERWRITE INTO TABLE loc_staging +PREHOOK: type: LOAD +PREHOOK: Output: default@loc_staging +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/loc.txt' OVERWRITE INTO TABLE loc_staging +POSTHOOK: type: LOAD +POSTHOOK: Output: default@loc_staging +PREHOOK: query: create table if not exists loc_orc ( + state string, + locid int, + zip bigint +) partitioned by(year int) stored as orc +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table if not exists loc_orc ( + state string, + locid int, + zip bigint +) partitioned by(year int) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@loc_orc +PREHOOK: query: insert overwrite table loc_orc partition(year) select * from loc_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@loc_staging +PREHOOK: Output: default@loc_orc +POSTHOOK: query: insert overwrite table loc_orc partition(year) select * from loc_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@loc_staging +POSTHOOK: Output: default@loc_orc@year=2001 +POSTHOOK: Output: default@loc_orc@year=__HIVE_DEFAULT_PARTITION__ +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +PREHOOK: query: -- stats are disabled. basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL + +-- basicStatState: PARTIAL level: PARTITION colStatState: NONE +explain extended select * from loc_orc +PREHOOK: type: QUERY +POSTHOOK: query: -- stats are disabled. basic stats will report the file size but not raw data size. 
so initial statistics will be PARTIAL + +-- basicStatState: PARTIAL level: PARTITION colStatState: NONE +explain extended select * from loc_orc +POSTHOOK: type: QUERY +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME loc_orc))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Partition Description: + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2001 + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + partition_columns year + serialization.ddl struct loc_orc { string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + partition_columns year + serialization.ddl struct loc_orc { string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc + name: default.loc_orc + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year __HIVE_DEFAULT_PARTITION__ + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + partition_columns year + serialization.ddl struct loc_orc { string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + partition_columns year + serialization.ddl struct loc_orc { 
string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc + name: default.loc_orc + Processor Tree: + TableScan + alias: loc_orc + Statistics: + numRows: 0, rawDataSize: 621, basicStatsState: PARTIAL, statsLevel: PARTITION, colStatsState: NONE + GatherStats: false + Select Operator + expressions: + expr: state + type: string + expr: locid + type: int + expr: zip + type: bigint + expr: year + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: + numRows: 0, rawDataSize: 621, basicStatsState: PARTIAL, statsLevel: PARTITION, colStatsState: NONE + ListSink + + +PREHOOK: query: -- table level analyze statistics +analyze table loc_orc partition(year=2001) compute statistics +PREHOOK: type: QUERY +PREHOOK: Input: default@loc_orc +PREHOOK: Input: default@loc_orc@year=2001 +PREHOOK: Output: default@loc_orc +PREHOOK: Output: default@loc_orc@year=2001 +POSTHOOK: query: -- table level analyze statistics +analyze table loc_orc partition(year=2001) compute statistics +POSTHOOK: type: QUERY +POSTHOOK: Input: default@loc_orc +POSTHOOK: Input: default@loc_orc@year=2001 +POSTHOOK: Output: default@loc_orc +POSTHOOK: Output: default@loc_orc@year=2001 +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +PREHOOK: query: -- basicStatState: PARTIAL level: PARTITION colStatState: NONE +explain extended select * from loc_orc +PREHOOK: type: QUERY +POSTHOOK: query: -- basicStatState: PARTIAL level: PARTITION colStatState: NONE +explain extended select * from loc_orc +POSTHOOK: type: QUERY +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME loc_orc))) (TOK_INSERT (TOK_DESTINATION 
(TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Partition Description: + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2001 + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + numFiles 1 + numRows 7 + partition_columns year + rawDataSize 0 + serialization.ddl struct loc_orc { string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 344 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + numFiles 1 + numPartitions 1 + numRows 7 + partition_columns year + rawDataSize 0 + serialization.ddl struct loc_orc { string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 344 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc + name: default.loc_orc + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year __HIVE_DEFAULT_PARTITION__ + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + partition_columns year + serialization.ddl struct loc_orc { string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + numFiles 1 + numPartitions 1 + numRows 7 + partition_columns year + rawDataSize 0 + serialization.ddl struct loc_orc { string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 344 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc + name: default.loc_orc + Processor Tree: + TableScan + alias: loc_orc + Statistics: + numRows: 7, rawDataSize: 621, basicStatsState: PARTIAL, statsLevel: PARTITION, colStatsState: NONE + GatherStats: false + Select Operator + expressions: + expr: state + type: string + expr: locid + type: int + expr: zip + type: bigint + expr: year + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: + numRows: 7, rawDataSize: 621, basicStatsState: PARTIAL, statsLevel: PARTITION, colStatsState: NONE + ListSink + + +PREHOOK: query: -- basicStatState: COMPLETE level: PARTITION colStatState: NONE +explain extended select * from loc_orc where year=2001 +PREHOOK: type: QUERY +POSTHOOK: query: -- basicStatState: COMPLETE level: 
PARTITION colStatState: NONE +explain extended select * from loc_orc where year=2001 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME loc_orc))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (TOK_TABLE_OR_COL year) 2001)))) + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Partition Description: + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year 2001 + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + numFiles 1 + numRows 7 + partition_columns year + rawDataSize 0 + serialization.ddl struct loc_orc { string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 344 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + numFiles 1 + numPartitions 1 + numRows 7 + partition_columns year + rawDataSize 0 + serialization.ddl struct loc_orc { string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 344 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc + name: default.loc_orc + Processor Tree: + TableScan + alias: loc_orc + Statistics: + numRows: 7, rawDataSize: 344, basicStatsState: COMPLETE, statsLevel: PARTITION, colStatsState: NONE + GatherStats: false + Select Operator + expressions: + expr: state + type: string + expr: locid + type: int + expr: zip + type: bigint + expr: year + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: + numRows: 7, rawDataSize: 344, basicStatsState: COMPLETE, statsLevel: PARTITION, colStatsState: NONE + ListSink + + +PREHOOK: query: -- basicStatState: NONE level: PARTITION colStatState: NONE +explain extended select * from loc_orc where year='__HIVE_DEFAULT_PARTITION__' +PREHOOK: type: QUERY +POSTHOOK: query: -- basicStatState: NONE level: PARTITION colStatState: NONE +explain extended select * from loc_orc where year='__HIVE_DEFAULT_PARTITION__' 
+POSTHOOK: type: QUERY +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=2001).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).state SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc PARTITION(year=__HIVE_DEFAULT_PARTITION__).zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME loc_orc))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (TOK_TABLE_OR_COL year) '__HIVE_DEFAULT_PARTITION__')))) + +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Partition Description: + Partition + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + partition values: + year __HIVE_DEFAULT_PARTITION__ + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + partition_columns year + serialization.ddl struct loc_orc { string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count -1 + columns state,locid,zip + columns.types string:int:bigint +#### A masked pattern was here #### + name default.loc_orc + numFiles 1 + numPartitions 1 + numRows 7 + partition_columns year + rawDataSize 0 + serialization.ddl struct loc_orc { string state, i32 locid, i64 zip} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 344 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.loc_orc + name: default.loc_orc + Processor Tree: + TableScan + alias: loc_orc + Statistics: + numRows: 0, rawDataSize: 277, basicStatsState: PARTIAL, statsLevel: PARTITION, colStatsState: NONE + GatherStats: false + Select Operator + expressions: + expr: state + type: string + expr: locid + type: int + expr: zip + type: bigint + expr: year + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: + numRows: 0, rawDataSize: 277, basicStatsState: PARTIAL, statsLevel: PARTITION, colStatsState: NONE + ListSink + +