diff --git a/build.xml b/build.xml index f41db23..3e94bca 100644 --- a/build.xml +++ b/build.xml @@ -52,6 +52,7 @@ + diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 7e5e19f..e45f7e3 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -345,6 +345,10 @@ public class HiveConf extends Configuration { // For har files HIVEARCHIVEENABLED("hive.archive.enabled", false), HIVEHARPARENTDIRSETTABLE("hive.archive.har.parentdir.settable", false), + + //Enable/Disable gbToIdx rewrite rule + HIVEOPTGBYUSINGINDEX("hive.optimize.gbyusingindex", false), + HIVEOUTERJOINSUPPORTSFILTERS("hive.outerjoin.supports.filters", true), // Serde for FetchTask @@ -368,7 +372,7 @@ public class HiveConf extends Configuration { HIVE_ERROR_ON_EMPTY_PARTITION("hive.error.on.empty.partition", false), - HIVE_INDEX_IGNORE_HDFS_LOC("hive.index.compact.file.ignore.hdfs", false), + HIVE_INDEX_IGNORE_HDFS_LOC("hive.index.compact.file.ignore.hdfs", false), ; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/..orig b/ql/src/java/org/apache/hadoop/hive/ql/metadata/..orig new file mode 100644 index 0000000..e69de29 diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index 5f78082..a065da9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -665,6 +665,16 @@ public class Hive { throw new HiveException(e); } } + public List getIndexesOnTable(String db_name, String tbl_name, + short max) throws HiveException { + try { + return getMSC().listIndexes(db_name, tbl_name, max); + } catch (NoSuchObjectException e) { + throw new HiveException("Partition or table doesn't exist.", e); + } catch (Exception e) { + throw new HiveException("Unknow error. Please check logs.", e); + } + } public boolean dropIndex(String db_name, String tbl_name, String index_name, boolean deleteData) throws HiveException { try { @@ -1476,7 +1486,7 @@ public class Hive { throw new HiveException(e); } } - + /** * Get all existing role names. 
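// Illustrative usage sketch (hypothetical snippet): the rewrite rule is gated on the
// ConfVars entry added above and is off by default, so it would be enabled per session
// with "SET hive.optimize.gbyusingindex=true;" or programmatically, e.g.:
//
//   import org.apache.hadoop.hive.conf.HiveConf;
//
//   HiveConf conf = new HiveConf();
//   HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVEOPTGBYUSINGINDEX, true);
//   assert HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEOPTGBYUSINGINDEX);
//
// The Optimizer.java hunk below only adds RewriteGBUsingIndex to the transformation
// list when this flag evaluates to true.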
* diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java index c55a4ec..5e4a22c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java @@ -38,6 +38,7 @@ import org.apache.hadoop.hive.metastore.MetaStoreUtils; import org.apache.hadoop.hive.metastore.ProtectMode; import org.apache.hadoop.hive.metastore.TableType; import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.Index; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.Order; import org.apache.hadoop.hive.metastore.api.SerDeInfo; @@ -805,4 +806,15 @@ public class Table implements Serializable { public String getCompleteName() { return getDbName() + "@" + getTableName(); } + + /** + * @return List containing Index Table names if there is exists indexes + * on this table + * @throws HiveException + **/ + public List getAllIndexes(short max) throws HiveException { + Hive hive = Hive.get(); + return hive.getIndexesOnTable(getTTable().getDbName(), getTTable().getTableName(), max); + } + }; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java index 590d69a..3d7ba1c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java @@ -49,6 +49,9 @@ public class Optimizer { if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTCP)) { transformations.add(new ColumnPruner()); } + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTGBYUSINGINDEX)) { + transformations.add(new RewriteGBUsingIndex()); + } if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTPPD)) { transformations.add(new PredicatePushDown()); transformations.add(new PartitionPruner()); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyCtx.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyCtx.java new file mode 100644 index 0000000..6bf830c --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyCtx.java @@ -0,0 +1,388 @@ +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.Index; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.PreOrderWalker; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/** + * RewriteCanApplyCtx class stores the context for the {@link 
RewriteCanApplyProcFactory} to determine + * if any index can be used and if the input query meets all the criteria for rewrite optimization. + */ +public final class RewriteCanApplyCtx implements NodeProcessorCtx { + + protected final Log LOG = LogFactory.getLog(RewriteCanApplyCtx.class.getName()); + + private RewriteCanApplyCtx(ParseContext parseContext, HiveConf conf) { + this.parseContext = parseContext; + this.hiveConf = conf; + initRewriteVars(); + } + + public static RewriteCanApplyCtx getInstance(ParseContext parseContext, HiveConf conf){ + return new RewriteCanApplyCtx(parseContext, conf); + } + + public static enum RewriteVars { + AGG_FUNC_CNT("hive.ql.rewrites.agg.func.cnt", 0), + GBY_KEY_CNT("hive.ql.rewrites.gby.key.cnt", 0), + QUERY_HAS_SORT_BY("hive.ql.rewrites.query.has.sort.by", false), + QUERY_HAS_ORDER_BY("hive.ql.rewrites.query.has.order.by", false), + QUERY_HAS_DISTRIBUTE_BY("hive.ql.rewrites.query.has.distribute.by", false), + QUERY_HAS_GROUP_BY("hive.ql.rewrites.query.has.group.by", false), + QUERY_HAS_DISTINCT("hive.ql.rewrites.query.has.distinct", false), //This still uses QBParseInfo to make decision. Needs to be changed if QB dependency is not desired. + AGG_FUNC_IS_NOT_COUNT("hive.ql.rewrites.agg.func.is.not.count", false), + AGG_FUNC_COLS_FETCH_EXCEPTION("hive.ql.rewrites.agg.func.cols.fetch.exception", false), + WHR_CLAUSE_COLS_FETCH_EXCEPTION("hive.ql.rewrites.whr.clause.cols.fetch.exception", false), + SEL_CLAUSE_COLS_FETCH_EXCEPTION("hive.ql.rewrites.sel.clause.cols.fetch.exception", false), + GBY_KEYS_FETCH_EXCEPTION("hive.ql.rewrites.gby.keys.fetch.exception", false), + COUNT_ON_ALL_COLS("hive.ql.rewrites.count.on.all.cols", false), + QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY("hive.ql.rewrites.query.has.genericudf.on.groupby.key", false), + QUERY_HAS_MULTIPLE_TABLES("hive.ql.rewrites.query.has.multiple.tables", false), + SHOULD_APPEND_SUBQUERY("hive.ql.rewrites.should.append.subquery", false), + REMOVE_GROUP_BY("hive.ql.rewrites.remove.group.by", false); + ; + + public final String varname; + public final int defaultIntVal; + public final boolean defaultBoolVal; + public final Class valClass; + + //Constructors for int and boolean values + RewriteVars(String varname, int defaultIntVal) { + this.varname = varname; + this.valClass = Integer.class; + this.defaultIntVal = defaultIntVal; + this.defaultBoolVal = false; + } + + RewriteVars(String varname, boolean defaultBoolVal) { + this.varname = varname; + this.valClass = Boolean.class; + this.defaultIntVal = -1; + this.defaultBoolVal = defaultBoolVal; + } + + @Override + public String toString() { + return varname; + } + + + + } + + /* + * Methods to set and retrieve the RewriteVars enum variables + * */ + public int getIntVar(Configuration conf, RewriteVars var) { + assert (var.valClass == Integer.class); + return conf.getInt(var.varname, var.defaultIntVal); + } + + public void setIntVar(Configuration conf, RewriteVars var, int val) { + assert (var.valClass == Integer.class); + conf.setInt(var.varname, val); + } + + public boolean getBoolVar(Configuration conf, RewriteVars var) { + assert (var.valClass == Boolean.class); + return conf.getBoolean(var.varname, var.defaultBoolVal); + } + + public void setBoolVar(Configuration conf, RewriteVars var, boolean val) { + assert (var.valClass == Boolean.class); + conf.setBoolean(var.varname, val); + } + + public void initRewriteVars(){ + setIntVar(hiveConf, RewriteVars.AGG_FUNC_CNT,0); + setIntVar(hiveConf, RewriteVars.GBY_KEY_CNT,0); + setBoolVar(hiveConf, 
RewriteVars.QUERY_HAS_SORT_BY, false); + setBoolVar(hiveConf, RewriteVars.QUERY_HAS_ORDER_BY, false); + setBoolVar(hiveConf, RewriteVars.QUERY_HAS_DISTRIBUTE_BY, false); + setBoolVar(hiveConf, RewriteVars.QUERY_HAS_GROUP_BY, false); + setBoolVar(hiveConf, RewriteVars.QUERY_HAS_DISTINCT, false); + setBoolVar(hiveConf, RewriteVars.AGG_FUNC_IS_NOT_COUNT, false); + setBoolVar(hiveConf, RewriteVars.AGG_FUNC_COLS_FETCH_EXCEPTION, false); + setBoolVar(hiveConf, RewriteVars.WHR_CLAUSE_COLS_FETCH_EXCEPTION, false); + setBoolVar(hiveConf, RewriteVars.SEL_CLAUSE_COLS_FETCH_EXCEPTION, false); + setBoolVar(hiveConf, RewriteVars.GBY_KEYS_FETCH_EXCEPTION, false); + setBoolVar(hiveConf, RewriteVars.COUNT_ON_ALL_COLS, false); + setBoolVar(hiveConf, RewriteVars.QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY, false); + setBoolVar(hiveConf, RewriteVars.QUERY_HAS_MULTIPLE_TABLES, false); + setBoolVar(hiveConf, RewriteVars.SHOULD_APPEND_SUBQUERY, false); + setBoolVar(hiveConf, RewriteVars.REMOVE_GROUP_BY, false); + } + + + + + //Data structures that are populated in the RewriteCanApplyProcFactory methods to check if the index key meets all criteria + Set selectColumnsList = new LinkedHashSet(); + Set predicateColumnsList = new LinkedHashSet(); + Set gbKeyNameList = new LinkedHashSet(); + Set aggFuncColList = new LinkedHashSet(); + + private final HiveConf hiveConf; + private int aggFuncCnt = 0; + private final ParseContext parseContext; + private String baseTableName = ""; + + void resetCanApplyCtx(){ + aggFuncCnt = 0; + selectColumnsList.clear(); + predicateColumnsList.clear(); + gbKeyNameList.clear(); + aggFuncColList.clear(); + baseTableName = ""; + } + + public Set getSelectColumnsList() { + return selectColumnsList; + } + + public void setSelectColumnsList(Set selectColumnsList) { + this.selectColumnsList = selectColumnsList; + } + + public Set getPredicateColumnsList() { + return predicateColumnsList; + } + + public void setPredicateColumnsList(Set predicateColumnsList) { + this.predicateColumnsList = predicateColumnsList; + } + + public Set getGbKeyNameList() { + return gbKeyNameList; + } + + public void setGbKeyNameList(Set gbKeyNameList) { + this.gbKeyNameList = gbKeyNameList; + } + + public Set getAggFuncColList() { + return aggFuncColList; + } + + public void setAggFuncColList(Set aggFuncColList) { + this.aggFuncColList = aggFuncColList; + } + + public HiveConf getConf() { + return hiveConf; + } + + public int getAggFuncCnt() { + return aggFuncCnt; + } + + public void setAggFuncCnt(int aggFuncCnt) { + this.aggFuncCnt = aggFuncCnt; + } + + public String getBaseTableName() { + return baseTableName; + } + + public void setBaseTableName(String baseTableName) { + this.baseTableName = baseTableName; + } + + public ParseContext getParseContext() { + return parseContext; + } + + + /** + * This method walks all the nodes starting from topOp TableScanOperator node + * and invokes methods from {@link RewriteCanApplyProcFactory} for each of the rules + * added to the opRules map. We use the {@link DefaultGraphWalker} for a post-order + * traversal of the operator tree. + * + * The methods from {@link RewriteCanApplyProcFactory} set appropriate values in + * {@link RewriteVars} enum. 
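// Illustrative (hypothetical query): for
//   SELECT key, COUNT(key) FROM tbl WHERE key < 100 GROUP BY key ORDER BY key
// the walk below would set QUERY_HAS_GROUP_BY, AGG_FUNC_CNT=1 and QUERY_HAS_ORDER_BY;
// RewriteGBUsingIndex.checkIfAllRewriteCriteriaIsMet() then rejects the rewrite
// because of the ORDER BY, while the same query without the ORDER BY clause remains
// a candidate.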
+ * + * @param topOp + */ + void populateRewriteVars(Operator topOp){ + Map opRules = new LinkedHashMap(); + opRules.put(new RuleRegExp("R1", "FIL%"), RewriteCanApplyProcFactory.canApplyOnFilterOperator()); + opRules.put(new RuleRegExp("R2", "GBY%"), RewriteCanApplyProcFactory.canApplyOnGroupByOperator()); + opRules.put(new RuleRegExp("R3", "RS%OP%"), RewriteCanApplyProcFactory.canApplyOnExtractOperator()); + opRules.put(new RuleRegExp("R4", "SEL%"), RewriteCanApplyProcFactory.canApplyOnSelectOperator()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this); + GraphWalker ogw = new PreOrderWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.add(topOp); + + try { + ogw.startWalking(topNodes, null); + } catch (SemanticException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + } + + + /** + * Default procedure for {@link DefaultRuleDispatcher} + * @return + */ + private NodeProcessor getDefaultProc() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack stack, + NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { + return null; + } + }; + } + + + //Map for base table to index table mapping + //TableScan operator for base table will be modified to read from index table + private final HashMap baseToIdxTableMap = new HashMap();; + + + public void addTable(String baseTableName, String indexTableName) { + baseToIdxTableMap.put(baseTableName, indexTableName); + } + + public String findBaseTable(String baseTableName) { + return baseToIdxTableMap.get(baseTableName); + } + + + boolean isIndexUsableForQueryBranchRewrite(Index index, Set indexKeyNames){ + boolean removeGroupBy = true; + boolean optimizeCount = false; + + //-------------------------------------------- + //Check if all columns in select list are part of index key columns + if (!indexKeyNames.containsAll(selectColumnsList)) { + LOG.info("Select list has non index key column : " + + " Cannot use index " + index.getIndexName()); + return false; + } + + //-------------------------------------------- + // Check if all columns in where predicate are part of index key columns + // TODO: Currently we allow all predicates , would it be more efficient + // (or at least not worse) to read from index_table and not from baseTable? + if (!indexKeyNames.containsAll(predicateColumnsList)) { + LOG.info("Predicate column ref list has non index key column : " + + " Cannot use index " + index.getIndexName()); + return false; + } + + //-------------------------------------------- + // For group by, we need to check if all keys are from index columns + // itself. Here GB key order can be different than index columns but that does + // not really matter for final result. + // E.g. select c1, c2 from src group by c2, c1; + // we can rewrite this one to: + // select c1, c2 from src_cmpt_idx; + if (!indexKeyNames.containsAll(gbKeyNameList)) { + LOG.info("Group by key has some non-indexed columns, " + + " Cannot use index " + index.getIndexName()); + return false; + } + + // FUTURE: See if this can be relaxed. + // If we have agg function (currently only COUNT is supported), check if its input are + // from index. we currently support only that. 
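// Illustrative (assuming a single index key named key): COUNT(key) passes the check
// below and, because the aggregation input equals the index key set, optimizeCount
// is turned on; COUNT(value) over a non-index column fails the containsAll check
// and the index is rejected for this query branch.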
+ if (aggFuncColList.size() > 0) { + if (indexKeyNames.containsAll(aggFuncColList) == false) { + LOG.info("Agg Func input is not present in index key columns. Currently " + + "only agg func on index columns are supported by rewrite optimization" ); + return false; + } + // If we have count on some key, check if key is same as index key, + if (aggFuncColList.containsAll(indexKeyNames)) { + optimizeCount = true; + } + } + + if (!gbKeyNameList.containsAll(indexKeyNames)) { + // GB key and idx key are not same, don't remove GroupBy, but still do index scan + LOG.info("Index has some non-groupby columns, GroupBy will be" + + " preserved by rewrite optimization but original table scan" + + " will be replaced with index table scan." ); + removeGroupBy = false; + } + + // This check prevents to remove GroupBy for cases where the GROUP BY key cols are + // not simple expressions i.e. simple index key cols (in any order), but some + // expressions on the the key cols. + // e.g. + // 1. GROUP BY key, f(key) + // FUTURE: If f(key) output is functionally dependent on key, then we should support + // it. However we don't have mechanism/info about f() yet to decide that. + // 2. GROUP BY idxKey, 1 + // FUTURE: GB Key has literals along with idxKeyCols. Develop a rewrite to eliminate the + // literals from GB key. + // 3. GROUP BY idxKey, idxKey + // FUTURE: GB Key has dup idxKeyCols. Develop a rewrite to eliminate the dup key cols + // from GB key. + if (getBoolVar(hiveConf, RewriteVars.QUERY_HAS_GROUP_BY) && + indexKeyNames.size() < getIntVar(hiveConf, RewriteVars.GBY_KEY_CNT)) { + LOG.info("Group by key has some non-indexed columns, GroupBy will be" + + " preserved by rewrite optimization" ); + removeGroupBy = false; + } + + + //Now that we are good to do this optimization, set parameters in context + //which would be used by transformation procedure as inputs. + + //sub-query is needed only in case of optimizecount and complex gb keys? + if(getBoolVar(hiveConf, RewriteVars.QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY) == false + && !(optimizeCount == true && removeGroupBy == false) ) { + setBoolVar(hiveConf, RewriteVars.REMOVE_GROUP_BY, removeGroupBy); + addTable(baseTableName, index.getIndexTableName()); + }else if(getBoolVar(hiveConf, RewriteVars.QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY) == true && + getIntVar(hiveConf, RewriteVars.AGG_FUNC_CNT) == 1 && + getBoolVar(hiveConf, RewriteVars.AGG_FUNC_IS_NOT_COUNT) == false){ + setBoolVar(hiveConf, RewriteVars.SHOULD_APPEND_SUBQUERY, true); + addTable(baseTableName, index.getIndexTableName()); + }else{ + LOG.info("No valid criteria met to apply rewrite." 
); + return false; + } + + return true; + } + + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyProcFactory.java new file mode 100644 index 0000000..ff8d90f --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyProcFactory.java @@ -0,0 +1,308 @@ +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.ExtractOperator; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.exec.FilterOperator; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.optimizer.RewriteCanApplyCtx.RewriteVars; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.HiveParser; +import org.apache.hadoop.hive.ql.parse.QBParseInfo; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.FilterDesc; +import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; + +/** + * Factory of methods used by {@link RewriteGBUsingIndex} (see checkEachDAGOperator(..) method) + * to determine if the rewrite optimization can be applied to the input query + * + */ +public final class RewriteCanApplyProcFactory { + protected final static Log LOG = LogFactory.getLog(RewriteCanApplyProcFactory.class.getName()); + private static RewriteCanApplyCtx canApplyCtx = null; + + private RewriteCanApplyProcFactory(){ + //this prevents the class from getting instantiated + } + + + /** + * Check for conditions in FilterOperator that do not meet rewrite criteria. + * Set the appropriate variables in {@link RewriteVars} enum. + */ + private static class CheckFilterProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + FilterOperator operator = (FilterOperator)nd; + canApplyCtx = (RewriteCanApplyCtx)ctx; + FilterDesc conf = (FilterDesc)operator.getConf(); + //The filter operator should have a predicate of ExprNodeGenericFuncDesc type. 
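// (Illustrative: for a hypothetical predicate such as key < 100, conf.getPredicate()
// returns the ExprNodeGenericFuncDesc of that comparison and getCols() yields [key],
// which is recorded below in predicateColumnsList for the index-coverage check.)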
+ //This represents the comparison operator + ExprNodeGenericFuncDesc oldengfd = (ExprNodeGenericFuncDesc) conf.getPredicate(); + if(oldengfd == null){ + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.WHR_CLAUSE_COLS_FETCH_EXCEPTION, true); + //return false; + } + //The predicate should have valid left and right columns + List colList = oldengfd.getCols(); + if(colList == null || colList.size() == 0){ + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.WHR_CLAUSE_COLS_FETCH_EXCEPTION, true); + //return false; + } + //Add the predicate columns to RewriteCanApplyCtx's predColRefs list to check later + //if index keys contain all filter predicate columns and vice-a-versa + for (String col : colList) { + canApplyCtx.getPredicateColumnsList().add(col); + } + + return null; + } + } + + public static CheckFilterProc canApplyOnFilterOperator() { + return new CheckFilterProc(); + } + + + + /** + * Check for conditions in GroupByOperator that do not meet rewrite criteria. + * Set the appropriate variables in {@link RewriteVars} enum. + * + */ + private static class CheckGroupByProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + GroupByOperator operator = (GroupByOperator)nd; + canApplyCtx = (RewriteCanApplyCtx)ctx; + //for each group-by clause in query, only one GroupByOperator of the GBY-RS-GBY sequence is stored in getGroupOpToInputTables + //we need to process only this operator + //Also, we do not rewrite for cases when same query branch has multiple group-by constructs + if(canApplyCtx.getParseContext().getGroupOpToInputTables().containsKey(operator) && + canApplyCtx.getBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_GROUP_BY) == false ){ + + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_GROUP_BY, true); + + GroupByDesc conf = (GroupByDesc) operator.getConf(); + ArrayList aggrList = conf.getAggregators(); + if(aggrList != null && aggrList.size() > 0){ + for (AggregationDesc aggregationDesc : aggrList) { + int aggCnt = canApplyCtx.getAggFuncCnt(); + canApplyCtx.setIntVar(canApplyCtx.getConf(), RewriteVars.AGG_FUNC_CNT, aggCnt + 1); + canApplyCtx.setAggFuncCnt(aggCnt + 1); + + //In the current implementation, we do not support more than 1 agg funcs in group-by + if(canApplyCtx.getIntVar(canApplyCtx.getConf(), RewriteVars.AGG_FUNC_CNT) > 1) { + return false; + } + String aggFunc = aggregationDesc.getGenericUDAFName(); + if(!aggFunc.equals("count")){ + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.AGG_FUNC_IS_NOT_COUNT, true); + //return false; + }else{ + ArrayList para = aggregationDesc.getParameters(); + //for a valid aggregation, it needs to have non-null parameter list + if(para == null){ + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.AGG_FUNC_COLS_FETCH_EXCEPTION, true); + //return false; + }else if(para.size() == 0){ + //count(*) case + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.COUNT_ON_ALL_COLS, true); + //return false; + }else{ + for(int i=0; i< para.size(); i++){ + ExprNodeDesc end = para.get(i); + if(end instanceof ExprNodeColumnDesc){ + //Add the columns to RewriteCanApplyCtx's selectColumnsList list to check later + //if index keys contain all select clause columns and vice-a-versa + //we get the select column 'actual' names only here if we have a agg func along with groub-by + //SelectOperator has internal names in its colList data structure + 
canApplyCtx.getSelectColumnsList().add(((ExprNodeColumnDesc) end).getColumn()); + + //Add the columns to RewriteCanApplyCtx's aggFuncColList list to check later + //if columns contained in agg func are index key columns + canApplyCtx.getAggFuncColList().add(((ExprNodeColumnDesc) end).getColumn()); + } + } + } + } + } + }else{ + //if group-by does not have aggregation list, then it "might" be a DISTINCT case + //this code uses query block to determine if the ASTNode tree contains the distinct TOK_SELECTDI token + QBParseInfo qbParseInfo = canApplyCtx.getParseContext().getQB().getParseInfo(); + Set clauseNameSet = qbParseInfo.getClauseNames(); + if (clauseNameSet.size() == 1) { + Iterator clauseNameIter = clauseNameSet.iterator(); + String clauseName = clauseNameIter.next(); + ASTNode rootSelExpr = qbParseInfo.getSelForClause(clauseName); + boolean isDistinct = (rootSelExpr.getType() == HiveParser.TOK_SELECTDI); + if(isDistinct) { + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_DISTINCT, true); + } + } + } + + + //we need to have non-null groub-by keys for a valid groub-by operator + ArrayList keyList = conf.getKeys(); + if(keyList == null || keyList.size() == 0){ + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.GBY_KEYS_FETCH_EXCEPTION, true); + //return false; + } + + //sets the no. of keys in groub by to be used later to determine is group-by has non-index cols + //group-by needs to be preserved in such cases (eg.group-by using a function on index key. This is the subquery append case) + canApplyCtx.setIntVar(canApplyCtx.getConf(), RewriteVars.GBY_KEY_CNT, keyList.size()); + for (ExprNodeDesc exprNodeDesc : keyList) { + if(exprNodeDesc instanceof ExprNodeColumnDesc){ + //Add the group-by keys to RewriteCanApplyCtx's gbKeyNameList list to check later + //if all keys are from index columns + canApplyCtx.getGbKeyNameList().addAll(exprNodeDesc.getCols()); + }else if(exprNodeDesc instanceof ExprNodeGenericFuncDesc){ + ExprNodeGenericFuncDesc endfg = (ExprNodeGenericFuncDesc)exprNodeDesc; + List childExprs = endfg.getChildExprs(); + for (ExprNodeDesc end : childExprs) { + if(end instanceof ExprNodeColumnDesc){ + //Set QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY to true which is used later to determine + //whether the rewrite is a 'append subquery' case + //this is true in case the group-by key is a GenericUDF like year,month etc + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY, true); + canApplyCtx.getGbKeyNameList().addAll(exprNodeDesc.getCols()); + canApplyCtx.getSelectColumnsList().add(((ExprNodeColumnDesc) end).getColumn()); + } + } + } + } + + } + + return null; + } + } + + public static CheckGroupByProc canApplyOnGroupByOperator() { + return new CheckGroupByProc(); + } + + + /** + * Check for conditions in ExtractOperator that do not meet rewrite criteria. + * Set the appropriate variables in {@link RewriteVars} enum. + * + */ + private static class CheckExtractProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... 
nodeOutputs) throws SemanticException { + ExtractOperator operator = (ExtractOperator)nd; + canApplyCtx = (RewriteCanApplyCtx)ctx; + //We get the information whether query has SORT BY, ORDER BY, DISTRIBUTE BY from + //the parent ReduceSinkOperator of the current ExtractOperator + if(operator.getParentOperators() != null && operator.getParentOperators().size() >0){ + Operator interim = operator.getParentOperators().get(0); + if(interim instanceof ReduceSinkOperator){ + ReduceSinkDesc conf = (ReduceSinkDesc) interim.getConf(); + ArrayList partCols = conf.getPartitionCols(); + int nr = conf.getNumReducers(); + if(nr == -1){ + if(partCols != null && partCols.size() > 0){ + //query has distribute-by is there are non-zero partition columns + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_DISTRIBUTE_BY, true); + //return false; + }else{ + //we do not need partition columns in case of sort-by + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_SORT_BY, true); + //return false; + } + }else if(nr == 1){ + //Query has order-by only if number of reducers is 1 + canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_ORDER_BY, true); + //return false; + } + + } + } + + return null; + } + } + + public static CheckExtractProc canApplyOnExtractOperator() { + return new CheckExtractProc(); + } + + /** + * Check for conditions in SelectOperator that do not meet rewrite criteria. + * Set the appropriate variables in {@link RewriteVars} enum. + * + */ + private static class CheckSelectProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + SelectOperator operator = (SelectOperator)nd; + canApplyCtx = (RewriteCanApplyCtx)ctx; + + List> childrenList = operator.getChildOperators(); + Operator child = childrenList.get(0); + if(child instanceof FileSinkOperator){ + Map internalToAlias = new LinkedHashMap(); + RowSchema rs = operator.getSchema(); + //to get the internal to alias mapping + ArrayList sign = rs.getSignature(); + for (ColumnInfo columnInfo : sign) { + internalToAlias.put(columnInfo.getInternalName(), columnInfo.getAlias()); + //Add the columns to RewriteCanApplyCtx's selectColumnsList list to check later + //if index keys contain all select clause columns and vice-a-versa +/* if(!columnInfo.getAlias().startsWith("_c")){ + canApplyCtx.getSelectColumnsList().add(columnInfo.getAlias()); + } +*/ } + + //if FilterOperator predicate has internal column names, we need to retrieve the 'actual' column names to + //check if index keys contain all filter predicate columns and vice-a-versa + Iterator predItr = canApplyCtx.getPredicateColumnsList().iterator(); + while(predItr.hasNext()){ + String predCol = predItr.next(); + String newPredCol = ""; + if(predCol.startsWith("_c") && internalToAlias.get(predCol) != null){ + newPredCol = internalToAlias.get(predCol); + canApplyCtx.getPredicateColumnsList().remove(predCol); + canApplyCtx.getPredicateColumnsList().add(newPredCol); + } + } + } + return null; + } + } + + public static CheckSelectProc canApplyOnSelectOperator() { + return new CheckSelectProc(); + } + + + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteGBUsingIndex.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteGBUsingIndex.java new file mode 100644 index 0000000..b8e7b5b --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteGBUsingIndex.java @@ -0,0 +1,509 @@ +/** + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.Index; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.RewriteCanApplyCtx.RewriteVars; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.QBParseInfo; +import org.apache.hadoop.hive.ql.parse.SemanticException; + + +/** + * RewriteGBUsingIndex is implemented as one of the Rule-based Optimizations. + * Implements optimizations for GroupBy clause rewrite using compact index. + * This optimization rewrites GroupBy query over base table to the query over simple table-scan over + * index table, if there is index on the group by key(s) or the distinct column(s). + * E.g. + * + * select key + * from table + * group by key; + * + * to + * + * select key + * from idx_table; + * + * + * The rewrite supports following queries + * - Queries having only those col refs that are in the index key. + * - Queries that have index key col refs + * - in SELECT + * - in WHERE + * - in GROUP BY + * - Queries with agg func COUNT(literal) or COUNT(index key col ref) + * in SELECT + * - Queries with SELECT DISTINCT index_key_col_refs + * - Queries having a subquery satisfying above condition (only the + * subquery is rewritten) + * + * FUTURE: + * - Many of the checks for above criteria rely on equivalence of expressions, + * but such framework/mechanism of expression equivalence isn't present currently or developed yet. + * This needs to be supported in order for better robust checks. This is critically important for + * correctness of a query rewrite system. + * - This code currently uses index types with org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler. + * However, the CompactIndexHandler currently stores the distinct block offsets and not the row offsets. + * Use of this index type will give erroneous results to compute COUNT if the same key appears more + * than once within the same block. 
To address this issue, we plan to create a new index type in future. + * + * + * @see RewriteCanApplyCtx + * @see RewriteCanApplyProcFactory + * @see RewriteRemoveGroupbyCtx + * @see RewriteRemoveGroupbyProcFactory + * @see RewriteIndexSubqueryCtx + * @see RewriteIndexSubqueryProcFactory + * @see RewriteParseContextGenerator + * + */ +public class RewriteGBUsingIndex implements Transform { + private ParseContext parseContext; + private Hive hiveDb; + private HiveConf hiveConf; + protected final Log LOG = LogFactory.getLog(this.getClass().getName()); + + //Stores the list of top TableScanOperator names for which the rewrite can be applied and the action that needs to be performed for operator tree + //starting from this TableScanOperator + private final Map tsOpToProcess = new LinkedHashMap(); + + //Name of the current table on which rewrite is being performed + private String baseTableName = null; + private String indexTableName = null; + + /***************************************Index Validation Variables***************************************/ + //The SUPPORTED_INDEX_TYPE value will change when we implement a new index handler to retrieve correct result + // for count if the same key appears more than once within the same block + final String SUPPORTED_INDEX_TYPE = + "org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler"; + final String COMPACT_IDX_BUCKET_COL = "_bucketname"; + final String COMPACT_IDX_OFFSETS_ARRAY_COL = "_offsets"; + + @Override + public ParseContext transform(ParseContext pctx) throws SemanticException { + parseContext = pctx; + hiveConf = parseContext.getConf(); + try { + hiveDb = Hive.get(hiveConf); + } catch (HiveException e) { + LOG.info("Exception in getting hive conf"); + e.printStackTrace(); + } + + + /* Check if the input query is internal query that inserts in table (eg. ALTER INDEX...REBUILD etc.) + * We do not apply optimization here. + * */ + if(isQueryInsertToTable()){ + return parseContext; + }else{ + /* Check if the input query passes all the tests to be eligible for a rewrite + * If yes, rewrite original query; else, return the current parseContext + * */ + if(shouldApplyOptimization()){ + LOG.debug("Rewriting Original Query."); + rewriteOriginalQuery(); + } + return parseContext; + } + + } + + /** + * Use Query block's parse {@link QBParseInfo} information to check if the input query + * is an internal SQL. + * If it is true, we do not apply this optimization. + * @return + */ + private boolean isQueryInsertToTable(){ + QBParseInfo qbParseInfo = parseContext.getQB().getParseInfo(); + return qbParseInfo.isInsertToTable(); + } + + /** + * We traverse the current operator tree to check for conditions in which the + * optimization cannot be applied. + * + * At the end, we check if all conditions have passed for rewrite. If yes, we + * determine if the the index is usable for rewrite. Else, we log the condition which + * did not meet the rewrite criterion. + * + * @return + */ + boolean shouldApplyOptimization(){ + boolean canApply = false; + if(ifQueryHasMultipleTables()){ + //We do not apply this optimization for this case as of now. + return false; + }else{ + /* + * This code iterates over each TableScanOperator from the topOps map from ParseContext. + * For each operator tree originating from this top TableScanOperator, we determine + * if the optimization can be applied. If yes, we add the name of the top table to + * the tsOpToProcess to apply rewrite later on. 
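// Illustrative prerequisite (hypothetical names): a table only becomes a rewrite
// candidate once a compact index of SUPPORTED_INDEX_TYPE exists on the group-by key
// and has been built, e.g.
//   CREATE INDEX tbl_key_idx ON TABLE tbl (key)
//   AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
//   WITH DEFERRED REBUILD;
//   ALTER INDEX tbl_key_idx ON tbl REBUILD;
// getIndexTableInfoForRewrite(topOp) below then returns a map from each matching
// Index to its key columns.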
+ * */ + HashMap topToTable = parseContext.getTopToTable(); + HashMap> topOps = parseContext.getTopOps(); + Iterator topOpItr = topToTable.keySet().iterator(); + while(topOpItr.hasNext()){ + //Context for checking if this optimization can be applied to the input query + RewriteCanApplyCtx canApplyCtx = RewriteCanApplyCtx.getInstance(parseContext, hiveConf); + + TableScanOperator topOp = topOpItr.next(); + Table table = topToTable.get(topOp); + baseTableName = table.getTableName(); + HashMap> indexTableMap = getIndexTableInfoForRewrite(topOp); + + if(indexTableMap != null){ + if(indexTableMap.size() == 0){ + LOG.info("No Valid Index Found to apply Rewrite, " + + "skipping " + getName() + " optimization" ); + } else if(indexTableMap.size() > 1){ + LOG.info("Table has multiple valid index tables to apply rewrite."); + }else{ + canApplyCtx.setBaseTableName(baseTableName); + canApplyCtx.populateRewriteVars(topOp); + + Iterator indexMapItr = indexTableMap.keySet().iterator(); + Index index = null; + while(indexMapItr.hasNext()){ + //we rewrite the original query using the first valid index encountered + //this can be changed if we have a better mechanism to decide which index will produce a better rewrite + index = indexMapItr.next(); + canApply = canApplyCtx.isIndexUsableForQueryBranchRewrite(index, indexTableMap.get(index)); + if(canApply){ + canApply = checkIfAllRewriteCriteriaIsMet(canApplyCtx); + break; + } + } + indexTableName = index.getIndexTableName(); + + if(canApply && topOps.containsValue(topOp)) { + Iterator topOpNamesItr = topOps.keySet().iterator(); + while(topOpNamesItr.hasNext()){ + String topOpName = topOpNamesItr.next(); + if(topOps.get(topOpName).equals(topOp)){ + tsOpToProcess.put(topOpName, canApplyCtx); + } + } + } + } + } + } + } + return canApply; + } + + + /** + * Method to rewrite the input query if all optimization criteria is passed. + * The method iterates over the tsOpToProcess {@link ArrayList} to apply the rewrites + * + * @throws SemanticException + */ + private void rewriteOriginalQuery() throws SemanticException{ + HashMap> topOpMap = parseContext.getTopOps(); + Iterator tsOpItr = tsOpToProcess.keySet().iterator(); + while(tsOpItr.hasNext()){ + baseTableName = tsOpItr.next(); + RewriteCanApplyCtx canApplyCtx = tsOpToProcess.get(baseTableName); + TableScanOperator topOp = (TableScanOperator) topOpMap.get(baseTableName); + + /* This part of the code checks if the 'REMOVE_GROUP_BY' value in RewriteVars enum is set to true. + * If yes, it sets the environment for the RewriteRemoveGroupbyCtx context and invokes + * method to apply rewrite by removing group by construct operators from the original operator tree. + * */ + if(canApplyCtx.getBoolVar(hiveConf, RewriteVars.REMOVE_GROUP_BY)){ + //Context for removing the group by construct operators from the operator tree + RewriteRemoveGroupbyCtx removeGbyCtx = RewriteRemoveGroupbyCtx.getInstance(parseContext, hiveDb, indexTableName); + removeGbyCtx.invokeRemoveGbyProc(topOp); + //Getting back new parseContext and new OpParseContext after GBY-RS-GBY is removed + parseContext = removeGbyCtx.getParseContext(); + parseContext.setOpParseCtx(removeGbyCtx.getOpc()); + LOG.info("Finished Group by Remove"); + } + + /* This part of the code checks if the 'SHOULD_APPEND_SUBQUERY' value in RewriteVars enum is set to true. + * If yes, it sets the environment for the RewriteIndexSubqueryCtx context and invokes + * method to append a new subquery that scans over the index table rather than the original table. 
+ * We first create the subquery context, then copy the RowSchema/RowResolver from subquery to original operator tree. + * */ + if(canApplyCtx.getBoolVar(hiveConf, RewriteVars.SHOULD_APPEND_SUBQUERY)){ + //Context for appending a subquery to scan over the index table + RewriteIndexSubqueryCtx subqueryCtx = RewriteIndexSubqueryCtx.getInstance(parseContext, indexTableName, baseTableName, + canApplyCtx.getSelectColumnsList()); + subqueryCtx.createSubqueryContext(); + + HashMap subqTopOpMap = subqueryCtx.getSubqueryPctx().getTopToTable(); + Iterator subqTopOpItr = subqTopOpMap.keySet().iterator(); + TableScanOperator subqTopOp = null; + if(subqTopOpItr.hasNext()){ + subqTopOp = subqTopOpItr.next(); + subqueryCtx.invokeSubquerySelectSchemaProc(subqTopOp); + LOG.info("Finished Fetching subquery select schema"); + subqueryCtx.invokeFixAllOperatorSchemasProc(topOp); + } + + parseContext = subqueryCtx.getParseContext(); + LOG.info("Finished appending subquery"); + } + } + + LOG.info("Finished Rewriting query"); + + } + + private String getName() { + return "RewriteGBUsingIndex"; + } + + + /** + * This method logs the reason for which we cannot apply the rewrite optimization. + * @return + */ + boolean checkIfAllRewriteCriteriaIsMet(RewriteCanApplyCtx canApplyCtx){ + if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.QUERY_HAS_DISTRIBUTE_BY)){ + LOG.info("Query has distributeby clause, " + + "that is not supported with " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.QUERY_HAS_SORT_BY)){ + LOG.info("Query has sortby clause, " + + "that is not supported with " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.QUERY_HAS_ORDER_BY)){ + LOG.info("Query has orderby clause, " + + "that is not supported with " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.getIntVar(hiveConf, RewriteVars.AGG_FUNC_CNT) > 1 ){ + LOG.info("More than 1 agg funcs: " + + "Not supported by " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.AGG_FUNC_IS_NOT_COUNT)){ + LOG.info("Agg func other than count is " + + "not supported by " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.COUNT_ON_ALL_COLS)){ + LOG.info("Currently count function needs group by on key columns. 
This is a count(*) case., " + + "Cannot apply this " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.AGG_FUNC_COLS_FETCH_EXCEPTION)){ + LOG.info("Got exception while locating child col refs " + + "of agg func, skipping " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.WHR_CLAUSE_COLS_FETCH_EXCEPTION)){ + LOG.info("Got exception while locating child col refs for where clause, " + + "skipping " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.SEL_CLAUSE_COLS_FETCH_EXCEPTION)){ + LOG.info("Got exception while locating child col refs for select list, " + + "skipping " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.GBY_KEYS_FETCH_EXCEPTION)){ + LOG.info("Got exception while locating child col refs for GroupBy key, " + + "skipping " + getName() + " optimization" ); + return false; + } + return true; + } + + + + /** + * This block of code iterates over the topToTable map from ParseContext + * to determine if the query has a scan over multiple tables. + * @return + */ + boolean ifQueryHasMultipleTables(){ + HashMap topToTable = parseContext.getTopToTable(); + Iterator valuesItr = topToTable.values().iterator(); + Set tableNameSet = new HashSet(); + while(valuesItr.hasNext()){ + Table table = valuesItr.next(); + tableNameSet.add(table.getTableName()); + } + if(tableNameSet.size() > 1){ + LOG.info("Query has more than one table " + + "that is not supported with " + getName() + " optimization" ); + return true; + } + return false; + } + + + /** + * Given a base table meta data, and a list of index types for which we need to find a matching index, + * this method returns a list of matching index tables. + * @param baseTableMetaData + * @param matchIndexTypes + * @return + */ + List getIndexes(Table baseTableMetaData, List matchIndexTypes) { + List matchingIndexes = new ArrayList(); + List indexesOnTable = null; + + try { + short maxNumOfIndexes = 1024; // XTODO: Hardcoding. Need to know if + // there's a limit (and what is it) on + // # of indexes that can be created + // on a table. If not, why is this param + // required by metastore APIs? + indexesOnTable = baseTableMetaData.getAllIndexes(maxNumOfIndexes); + + } catch (HiveException e) { + return matchingIndexes; // Return empty list (trouble doing rewrite + // shouldn't stop regular query execution, + // if there's serious problem with metadata + // or anything else, it's assumed to be + // checked & handled in core hive code itself. + } + + for (int i = 0; i < indexesOnTable.size(); i++) { + Index index = null; + index = indexesOnTable.get(i); + // The handler class implies the type of the index (e.g. compact + // summary index would be: + // "org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler"). + String indexType = index.getIndexHandlerClass(); + for (int j = 0; j < matchIndexTypes.size(); j++) { + if (indexType.equals(matchIndexTypes.get(j))) { + matchingIndexes.add(index); + break; + } + } + } + return matchingIndexes; + } + + + /** + * We retrieve the list of index tables on the current table (represented by the TableScanOperator) + * which can be used to apply rewrite on the original query + * and return if there are no index tables to be used for rewriting the input query. 
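// Illustrative: a compact index table is expected to hold one column per index key
// plus the two implementation columns _bucketname and _offsets;
// populateIndexToKeysMap() asserts exactly this shape before an index is accepted
// for rewrite.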
+ * + * @param topOp + * @return + */ + HashMap> getIndexTableInfoForRewrite(TableScanOperator topOp) { + HashMap> indexTableMap = null; + TableScanOperator ts = (TableScanOperator) topOp; + Table tsTable = parseContext.getTopToTable().get(ts); + if (tsTable != null) { + List idxType = new ArrayList(); + idxType.add(SUPPORTED_INDEX_TYPE); + List indexTables = getIndexes(tsTable, idxType); + if (indexTables.size() == 0) { + LOG.info("Table " + baseTableName + " does not have compact index. " + + "Cannot apply " + getName() + " optimization" ); + }else{ + indexTableMap = populateIndexToKeysMap(indexTables); + } + } + return indexTableMap; + } + + + /** + * This code block iterates over indexes on the table and picks + * up the first index that satisfies the rewrite criteria. + * @param indexTables + * @return + */ + HashMap> populateIndexToKeysMap(List indexTables){ + Index index = null; + Hive hiveInstance = hiveDb; + HashMap> indexToKeysMap = new LinkedHashMap>(); + + for (int idxCtr = 0; idxCtr < indexTables.size(); idxCtr++) { + final Set indexKeyNames = new LinkedHashSet(); + index = indexTables.get(idxCtr); + + //Getting index key columns + StorageDescriptor sd = index.getSd(); + List idxColList = sd.getCols(); + for (FieldSchema fieldSchema : idxColList) { + indexKeyNames.add(fieldSchema.getName()); + } + + + // Check that the index schema is as expected. This code block should + // catch problems of this rewrite breaking when the CompactIndexHandler + // index is changed. + // This dependency could be better handled by doing init-time check for + // compatibility instead of this overhead for every rewrite invocation. + ArrayList idxTblColNames = new ArrayList(); + try { + Table idxTbl = hiveInstance.getTable(index.getDbName(), + index.getIndexTableName()); + for (FieldSchema idxTblCol : idxTbl.getCols()) { + idxTblColNames.add(idxTblCol.getName()); + } + } catch (HiveException e) { + LOG.info("Got exception while locating index table, " + + "skipping " + getName() + " optimization" ); + return indexToKeysMap; + } + assert(idxTblColNames.contains(COMPACT_IDX_BUCKET_COL)); + assert(idxTblColNames.contains(COMPACT_IDX_OFFSETS_ARRAY_COL)); + assert(idxTblColNames.size() == indexKeyNames.size() + 2); + + //we add all index tables which can be used for rewrite and defer the decision of using a particular index for later + //this is to allow choosing a index if a better mechanism is designed later to chose a better rewrite + indexToKeysMap.put(index, indexKeyNames); + } + return indexToKeysMap; + + } + + + + +} + diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryCtx.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryCtx.java new file mode 100644 index 0000000..433e528 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryCtx.java @@ -0,0 +1,298 @@ +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import 
org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.PreOrderWalker; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; + +/** + * RewriteIndexSubqueryCtx class stores the context for the {@link RewriteIndexSubqueryProcFactory} processor factory methods + * + */ +public class RewriteIndexSubqueryCtx implements NodeProcessorCtx { + + private RewriteIndexSubqueryCtx(ParseContext parseContext, String indexTableName, + String baseTableName, Set selectColumnNames){ + //this prevents the class from getting instantiated + this.parseContext = parseContext; + this.indexName = indexTableName; + this.baseTableName = baseTableName; + this.selectColumnNames = selectColumnNames; + } + + public static RewriteIndexSubqueryCtx getInstance(ParseContext parseContext, String indexTableName, + String baseTableName, Set selectColumnNames){ + return new RewriteIndexSubqueryCtx(parseContext, indexTableName, baseTableName, selectColumnNames ); + } + protected final Log LOG = LogFactory.getLog(RewriteIndexSubqueryCtx.class.getName()); + + //This is populated in RewriteIndexSubqueryProcFactory's NewQuerySelectSchemaProc processor with the colExprMap of the + //SelectOperator whose parent is TableScanOperator + private Map newSelColExprMap = new LinkedHashMap(); + //The next two data structures are populated in RewriteIndexSubqueryProcFactory's NewQuerySelectSchemaProc processor + //with the colExprMap of the SelectOperator whose child is GroupByOperator + private final ArrayList newSelColList = new ArrayList(); + + // Initialise all data structures required to copy RowResolver, RowSchema, outputColumnNames, colList, colExprMap + //from subquery DAG to original DAG operators + private final ArrayList newOutputCols = new ArrayList(); + private Map newColExprMap = new HashMap(); + private final ArrayList newColList = new ArrayList(); + private final ArrayList newRS = new ArrayList(); + private RowResolver newRR = new RowResolver(); + + //This is populated in RewriteIndexSubqueryProcFactory's SubquerySelectSchemaProc processor for later + //use in NewQuerySelectSchemaProc processor + private final Map aliasToInternal = new LinkedHashMap(); + + // Get the parentOperators List for FileSinkOperator. 
We need this later to set the + // parentOperators for original DAG operator + private final List> subqFSParentList = new ArrayList>(); + + // We need the reference to this SelectOperator so that the original DAG can be appended here + private Operator subqSelectOp; + + //We replace the original TS operator with new TS operator from subquery context to scan over the index table + //rather than the original table + private Operator newTSOp; + + private final ParseContext parseContext; + private final Set selectColumnNames; + private final String indexName; + private final String baseTableName; + + private ParseContext subqueryPctx = null; + private ParseContext newDAGCtx = null; + + //We need the GenericUDAFEvaluator for GenericUDAF function "sum" when we append subquery to original operator tree + private GenericUDAFEvaluator eval = null; + + + public Set getSelectColumnNames() { + return selectColumnNames; + } + + public ArrayList getNewOutputCols() { + return newOutputCols; + } + + public Map getNewColExprMap() { + return newColExprMap; + } + + public void setNewColExprMap(Map newColExprMap) { + this.newColExprMap = newColExprMap; + } + + public ArrayList getNewColList() { + return newColList; + } + + public ArrayList getNewRS() { + return newRS; + } + + public RowResolver getNewRR() { + return newRR; + } + + public void setNewRR(RowResolver newRR) { + this.newRR = newRR; + } + + public List> getSubqFSParentList() { + return subqFSParentList; + } + + public Operator getSubqSelectOp() { + return subqSelectOp; + } + + public void setSubqSelectOp(Operator subqSelectOp) { + this.subqSelectOp = subqSelectOp; + } + + public Map getAliasToInternal() { + return aliasToInternal; + } + + public ParseContext getParseContext() { + return parseContext; + } + + public ParseContext getSubqueryPctx() { + return subqueryPctx; + } + + public void setSubqueryPctx(ParseContext subqueryPctx) { + this.subqueryPctx = subqueryPctx; + } + + public ParseContext getNewDAGCtx() { + return newDAGCtx; + } + + public void setNewDAGCtx(ParseContext newDAGCtx) { + this.newDAGCtx = newDAGCtx; + } + + public Map getNewSelColExprMap() { + return newSelColExprMap; + } + + public void setNewSelColExprMap(Map newSelColExprMap) { + this.newSelColExprMap = newSelColExprMap; + } + + public ArrayList getNewSelColList() { + return newSelColList; + } + + public String getIndexName() { + return indexName; + } + + public String getBaseTableName() { + return baseTableName; + } + + public GenericUDAFEvaluator getEval() { + return eval; + } + + public void setEval(GenericUDAFEvaluator eval) { + this.eval = eval; + } + + + public void setNewTSOp(Operator newTSOp) { + this.newTSOp = newTSOp; + } + + public Operator getNewTSOp() { + return newTSOp; + } + + /** + * We construct the string command for subquery using index key columns + * and use the {@link RewriteParseContextGenerator} to generate a operator tree + * and its ParseContext for the subquery string command + */ + void createSubqueryContext() { + String selKeys = ""; + for (String key : selectColumnNames) { + selKeys += key + ","; + } + String subqueryCommand = "select " + selKeys + " size(`_offsets`) as CNT from " + indexName; + subqueryPctx = RewriteParseContextGenerator.generateOperatorTree(parseContext.getConf(), subqueryCommand); + + } + + /** + * Walk the original operator tree using the {@link DefaultGraphWalker} using the rules. 
+   * Each of the rules invokes respective methods from the {@link RewriteIndexSubqueryProcFactory}
+   * to collect the subquery schema information into this context.
+   * @param topOp
+   * @throws SemanticException
+   */
+  public void invokeSubquerySelectSchemaProc(Operator<? extends Serializable> topOp) throws SemanticException{
+    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+    //removes the subquery FileSinkOperator from subquery OpParseContext as
+    //we do not need to append FS operator to original operator tree
+    opRules.put(new RuleRegExp("R1", "FS%"), RewriteIndexSubqueryProcFactory.getSubqueryFileSinkProc());
+    //copies the RowSchema, outputColumnNames, colList, RowResolver, columnExprMap to RewriteIndexSubqueryCtx data structures
+    opRules.put(new RuleRegExp("R2", "SEL%"), RewriteIndexSubqueryProcFactory.getSubquerySelectSchemaProc());
+
+    // The dispatcher fires the processor corresponding to the closest matching
+    // rule and passes the context along
+    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this);
+    GraphWalker ogw = new DefaultGraphWalker(disp);
+
+    // Create a list of topop nodes
+    ArrayList<Node> topNodes = new ArrayList<Node>();
+    topNodes.add(topOp);
+    ogw.startWalking(topNodes, null);
+
+  }
+
+
+  /**
+   * Walk the original operator tree using the {@link PreOrderWalker} using the rules.
+   * This method appends the subquery operator tree to the original operator tree,
+   * replaces the original table scan operator with the index table scan operator, and
+   * copies the information from {@link RewriteIndexSubqueryCtx} to the
+   * appropriate operators of the original operator tree.
+   * @param topOp
+   * @throws SemanticException
+   */
+  public void invokeFixAllOperatorSchemasProc(Operator<? extends Serializable> topOp) throws SemanticException{
+    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+
+    //appends subquery operator tree to original operator tree
+    opRules.put(new RuleRegExp("R1", "TS%"), RewriteIndexSubqueryProcFactory.getAppendSubqueryToOriginalQueryProc());
+
+    //copies RowSchema, outputColumnNames, colList, RowResolver, columnExprMap from RewriteIndexSubqueryCtx data structures
+    // to SelectOperator of original operator tree
+    opRules.put(new RuleRegExp("R2", "SEL%"), RewriteIndexSubqueryProcFactory.getNewQuerySelectSchemaProc());
+    //Manipulates the ExprNodeDesc from FilterOperator predicate list as per colList data structure from RewriteIndexSubqueryCtx
+    opRules.put(new RuleRegExp("R3", "FIL%"), RewriteIndexSubqueryProcFactory.getNewQueryFilterSchemaProc());
+    //Manipulates the ExprNodeDesc from GroupByOperator aggregation list and parameters list
+    //as per colList data structure from RewriteIndexSubqueryCtx
+    opRules.put(new RuleRegExp("R4", "GBY%"), RewriteIndexSubqueryProcFactory.getNewQueryGroupbySchemaProc());
+
+    // The dispatcher fires the processor corresponding to the closest matching
+    // rule and passes the context along
+    Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this);
+    GraphWalker ogw = new PreOrderWalker(disp);
+
+    // Create a list of topop nodes
+    ArrayList<Node> topNodes = new ArrayList<Node>();
+    topNodes.add(topOp);
+
+    ogw.startWalking(topNodes, null);
+
+  }
+
+
+  /**
+   * Default procedure for {@link DefaultRuleDispatcher}
+   * @return
+   */
+  private NodeProcessor getDefaultProc() {
+    return new NodeProcessor() {
+      @Override
+      public Object process(Node nd, Stack<Node> stack,
+          NodeProcessorCtx procCtx, Object...
nodeOutputs) throws SemanticException { + return null; + } + }; + } + + + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryProcFactory.java new file mode 100644 index 0000000..989c6aa --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryProcFactory.java @@ -0,0 +1,605 @@ +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.exec.FilterOperator; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.FilterDesc; +import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +/** + * Factory of processors used in {@link RewriteGBUsingIndex} (see invokeSubquerySelectSchemaProc(..) method) + * Each of the processors are invoked according to a rule and serve to append subquery to original operator tree. + * + * This subquery scans over the index table rather than the original table. + * IT replaces the count(literal)/count(index_key) function in the original select operator + * with sum(cnt) where cnt is size(_offsets) from subquery select operator. + * + * This change necessitates change in the rowSchema, colList, colExprMap, rowResolver of all the SelectOperator's in original + * operator tree. It also requires to set appropriate predicate parameters and group-by aggregation parameters in original + * operator tree. Each of the processors in this Factory take care of these changes. 
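+ * For example, for the lineitem test in ql_rewrite_gbtoidx.q (compact index on l_shipdate), a query such as
+ * <pre>
+ *   select year(l_shipdate), month(l_shipdate), count(1) as monthly_shipments
+ *   from lineitem
+ *   group by year(l_shipdate), month(l_shipdate);
+ * </pre>
+ * is effectively rewritten into a plan that is roughly equivalent to
+ * <pre>
+ *   select year(l_shipdate), month(l_shipdate), sum(cnt)
+ *   from (select l_shipdate, size(`_offsets`) as cnt
+ *         from default__lineitem_lineitem_lshipdate_idx__) t
+ *   group by year(l_shipdate), month(l_shipdate);
+ * </pre>
+ * where the index table name above is the one created by that test and is shown for illustration.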
+ * + */ +public final class RewriteIndexSubqueryProcFactory { + protected final static Log LOG = LogFactory.getLog(RewriteIndexSubqueryProcFactory.class.getName()); + private static RewriteIndexSubqueryCtx subqueryCtx = null; + + private RewriteIndexSubqueryProcFactory() { + //this prevents the class from getting instantiated + } + + /** + * This processor retrieves the rowSchema, rowResolver, colList, colExprMap and outputColumnNames data structures + * from the SelectOperator and its descriptor(SelectDesc). It stores the information in the RewriteIndexSubqueryCtx instance + * for later use in correcting the schema of original operator tree. + * + */ + private static class SubquerySelectSchemaProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + SelectOperator operator = (SelectOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + + //We need to clear this every time in cases where there are multiple operator tree paths with multiple SelectOperators + subqueryCtx.getNewOutputCols().clear(); + subqueryCtx.getNewColExprMap().clear(); + subqueryCtx.getNewColList().clear(); + subqueryCtx.getNewRS().clear(); + subqueryCtx.setNewRR(new RowResolver()); + + + RowResolver oldRR = subqueryCtx.getSubqueryPctx().getOpParseCtx().get(operator).getRowResolver(); + SelectDesc oldConf = (SelectDesc) operator.getConf(); + Map oldColumnExprMap = operator.getColumnExprMap(); + ArrayList oldColList = oldConf.getColList(); + + //We create the mapping of column name alias to internal name for later use in correcting original operator tree + ArrayList schemaSign = operator.getSchema().getSignature(); + for (ColumnInfo columnInfo : schemaSign) { + String internal = columnInfo.getInternalName(); + String alias = columnInfo.getAlias(); + subqueryCtx.getAliasToInternal().put(alias, internal); + } + + /**outputColumnNames**/ + String internalName = null; + for(int i=0; i < oldConf.getOutputColumnNames().size(); i++){ + internalName = oldConf.getOutputColumnNames().get(i); + //Populate all output columns (required by SelectOperators in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewOutputCols().add(new String(internalName)); + + /**colExprMap**/ + if(oldColumnExprMap != null){ + ExprNodeDesc end = oldColumnExprMap.get(internalName); //in case of simple column names + if(end instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc oldDesc = (ExprNodeColumnDesc)end ; + ExprNodeColumnDesc newDesc = (ExprNodeColumnDesc) oldDesc.clone(); + newDesc.setColumn(internalName); + //Populate columnExprMap (required by SelectOperator and FilterOperator in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewColExprMap().put(internalName, newDesc); + }else if(end instanceof ExprNodeGenericFuncDesc){ //in case of functions on columns + ExprNodeGenericFuncDesc oldDesc = (ExprNodeGenericFuncDesc)end ; + ExprNodeGenericFuncDesc newDesc = (ExprNodeGenericFuncDesc) oldDesc.clone(); + List childExprs = newDesc.getChildExprs(); + List newChildExprs = new ArrayList(); + for (ExprNodeDesc childEnd : childExprs) { //we have the list of columns here + if(childEnd instanceof ExprNodeColumnDesc){ + ((ExprNodeColumnDesc) childEnd).setColumn(internalName); + newChildExprs.add(childEnd); + } + newDesc.setChildExprs(newChildExprs); + //Populate columnExprMap (required by SelectOperator and FilterOperator in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewColExprMap().put(internalName, newDesc); + } + } + } + + 
/**colList**/ + if(oldColList != null){ + ExprNodeDesc exprNodeDesc = oldColList.get(i); + if(exprNodeDesc instanceof ExprNodeColumnDesc){//in case of simple column names + ExprNodeColumnDesc newDesc = (ExprNodeColumnDesc) exprNodeDesc.clone(); + newDesc.setColumn(internalName); + //Populate colList (required by SelectOperators in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewColList().add(newDesc); + }else if(exprNodeDesc instanceof ExprNodeGenericFuncDesc){//in case of functions on columns + ExprNodeGenericFuncDesc oldDesc = (ExprNodeGenericFuncDesc)exprNodeDesc ; + ExprNodeGenericFuncDesc newDesc = (ExprNodeGenericFuncDesc) oldDesc.clone(); + List childExprs = newDesc.getChildExprs(); + List newChildExprs = new ArrayList(); + for (ExprNodeDesc childEnd : childExprs) {//we have the list of columns here + if(childEnd instanceof ExprNodeColumnDesc){ + ((ExprNodeColumnDesc) childEnd).setColumn(internalName); + newChildExprs.add(childEnd); + } + newDesc.setChildExprs(newChildExprs); + //Populate colList (required by SelectOperators in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewColList().add(newDesc); + } + } + } + } + + /**RowSchema and RowResolver**/ + for (int i = 0; i < subqueryCtx.getNewOutputCols().size(); i++) { + internalName = subqueryCtx.getNewOutputCols().get(i); + String[] nm = oldRR.reverseLookup(internalName); + ColumnInfo col; + try { + //We need to set the alias for the new index table subquery + col = oldRR.get(nm[0], nm[1]); + if(nm[0] == null){ + nm[0] = "v" + i; //add different alias in case original query has multiple subqueries + } + // Populate RowResolver and RowSchema (required by SelectOperator and FilterOperator in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewRR().put(nm[0], nm[1], col); + subqueryCtx.getNewRS().add(col); + } catch (SemanticException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + //We need this SelectOperator from subquery as a reference point to append in original query + subqueryCtx.setSubqSelectOp(operator); + + return null; + } + } + + public static SubquerySelectSchemaProc getSubquerySelectSchemaProc(){ + return new SubquerySelectSchemaProc(); + } + + + /** + * We do not need the fileSinkOperator of the subquery operator tree when we append the rest of the subquery operator tree + * to the original operator tree. This processor gets rid of this FS operator by removing it from subquery OpParseContext. + * + */ + private static class SubqueryFileSinkProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + FileSinkOperator operator = (FileSinkOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + //Store the list of FileSinkOperator's parent operators as we later append the original query + //at the end of the subquery operator tree (without the FileSinkOperator). + subqueryCtx.getSubqFSParentList().addAll(operator.getParentOperators()); + subqueryCtx.getSubqueryPctx().getOpParseCtx().remove(operator); + return null; + } + } + + public static SubqueryFileSinkProc getSubqueryFileSinkProc(){ + return new SubqueryFileSinkProc(); + } + + /** + * This processor appends the subquery operator tree to the original operator tree. + * Since genPlan(..) method from the SemanticAnalyzer creates the operator tree bottom-up i.e. 
+ * FROM-WHERE-GROUPBY-ORDERBY-SELECT etc, any query with nested subqueries will have the TableScanOperator of the + * innermost subquery as the top operator in the topOps and topToTable maps. + * + * Any subquery which is a part of the from clause + * (eg: SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2;) always has its + * DAG operator tree appended before the operator tree of the enclosing query. + * For example, for the above query, the operator tree is: + * SEL(1)[subq]--->GBY(2)[subq]--->RS(3)[subq]--->GBY(4)[subq]--->SEL(5)[subq]--->FIL(6)[orig]--->SEL(7)[orig]--->FS(8)[orig]> + * + * We replace the TableScanOperator (TS) of the original operator tree with the whole subquery operator tree (without the + * FileSinkOperator of the subquery operator tree). + * + */ + private static class AppendSubqueryToOriginalQueryProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + TableScanOperator operator = (TableScanOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + List> origChildrenList = operator.getChildOperators(); + + /* origChildrenList has the child operators for the TableScanOperator of the original DAG + * We need to get rid of the TS operator of original DAG and append rest of the tree to the sub-query operator DAG + * This code sets the parentOperators of first operator in origChildrenList to subqFSParentList. + * subqFSParentList contains the parentOperators list of the FileSinkOperator of the sub-query operator DAG + * + * subqLastOp is the last SelectOperator of sub-query DAG. The rest of the original operator DAG needs to be appended here + * Hence, set the subqLastOp's child operators to be origChildrenList + * + * */ + if(origChildrenList != null && origChildrenList.size() > 0){ + origChildrenList.get(0).setParentOperators(subqueryCtx.getSubqFSParentList()); + } + if(subqueryCtx.getSubqSelectOp() != null){ + subqueryCtx.getSubqSelectOp().setChildOperators(origChildrenList); + } + + /* The operator DAG plan is generated in the order FROM-WHERE-GROUPBY-ORDERBY-SELECT + * We have appended the original operator DAG at the end of the sub-query operator DAG + * as the sub-query will always be a part of FROM processing + * Now we need to insert the final sub-query+original DAG to the original ParseContext + */ + + HashMap> subqTopMap = subqueryCtx.getSubqueryPctx().getTopOps(); + Iterator subqTabItr = subqTopMap.keySet().iterator(); + String subqTab = subqTabItr.next(); + Operator subqOp = subqTopMap.get(subqTab); + Table tbl = subqueryCtx.getSubqueryPctx().getTopToTable().get(subqOp); + + //remove original TableScanOperator from the topToTable map + //Put the new TableScanOperator (top operator of the subquery operator tree) to topToTable map + subqueryCtx.getParseContext().getTopToTable().remove(operator); + subqueryCtx.getParseContext().getTopToTable().put((TableScanOperator) subqOp, tbl); + + String tabAlias = ""; + if(subqueryCtx.getBaseTableName().contains(":")){ + String[] tabToAlias = subqueryCtx.getBaseTableName().split(":"); + if(tabToAlias.length > 1){ + tabAlias = tabToAlias[0] + ":"; + } + } + //remove original table and operator tree mapping from topOps + //put the new table alias adn subquery index table as the key and the new operator tree as value in topOps + subqueryCtx.getParseContext().getTopOps().remove(subqueryCtx.getBaseTableName()); + subqueryCtx.getParseContext().getTopOps().put(tabAlias + subqTab, subqOp); + + //we need this 
later + subqueryCtx.setNewTSOp(subqOp); + + //remove original TableScanOperator from the original OpParsecontext + //add all values from the subquery OpParseContext to the original OpParseContext + subqueryCtx.getParseContext().getOpParseCtx().remove(operator); + subqueryCtx.getParseContext().getOpParseCtx().putAll(subqueryCtx.getSubqueryPctx().getOpParseCtx()); + LOG.info("Finished appending subquery"); + return null; + } + } + + public static AppendSubqueryToOriginalQueryProc getAppendSubqueryToOriginalQueryProc(){ + return new AppendSubqueryToOriginalQueryProc(); + } + + + + /** + * NewQuerySelectSchemaProc. + * + */ + private static class NewQuerySelectSchemaProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + SelectOperator operator = (SelectOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + + List> parentOps = operator.getParentOperators(); + Operator parentOp = parentOps.iterator().next(); + List> childOps = operator.getChildOperators(); + Operator childOp = childOps.iterator().next(); + + + if(parentOp instanceof TableScanOperator){ + //We need to copy the colExprMap of this SelectOperator whose parent is TableScanOperator to the + //colExprMap of the SelectOperator whose child operator is a GroupByOperator + subqueryCtx.setNewSelColExprMap(operator.getColumnExprMap()); + }else if((!(parentOp instanceof TableScanOperator)) //skip first SelectOperator in operator tree + && (!(childOp instanceof FileSinkOperator)) //skip last SelectOperator in operator tree + && (!(childOp instanceof ReduceSinkOperator))){ //skip the SelectOperator which appears before a JOIN in operator tree + + //Copy colList and outputColumns for SelectOperator from sub-query DAG SelectOperator + //these are all the SelectOperators that come in between the first SelectOperator and last SelectOperator in the operator tree + operator.setColumnExprMap(subqueryCtx.getNewColExprMap()); + subqueryCtx.getParseContext().getOpParseCtx().get(operator).setRowResolver(subqueryCtx.getNewRR()); + operator.getSchema().setSignature(subqueryCtx.getNewRS()); + SelectDesc conf = (SelectDesc) operator.getConf(); + conf.setColList(subqueryCtx.getNewColList()); + conf.setOutputColumnNames(subqueryCtx.getNewOutputCols()); + } + + if (childOp instanceof GroupByOperator){ + //use the original columnExprMap to construct the newColList + subqueryCtx.getNewSelColList().clear(); + /**colList**/ + Set internalNamesList = operator.getColumnExprMap().keySet(); + for (String internal : internalNamesList) { + ExprNodeDesc end = operator.getColumnExprMap().get(internal).clone(); + if(end instanceof ExprNodeGenericFuncDesc){ + List colExprs = ((ExprNodeGenericFuncDesc)end).getChildExprs(); + for (ExprNodeDesc colExpr : colExprs) { + if(colExpr instanceof ExprNodeColumnDesc){ + if(!subqueryCtx.getNewSelColList().contains(colExpr)){ + TypeInfo typeInfo = colExpr.getTypeInfo(); + if(typeInfo instanceof ListTypeInfo){ + PrimitiveTypeInfo pti = new PrimitiveTypeInfo(); + pti.setTypeName("int"); + colExpr.setTypeInfo(pti); + } + subqueryCtx.getNewSelColList().add(colExpr); + } + } + } + + }else if(end instanceof ExprNodeColumnDesc){ + if(!subqueryCtx.getNewSelColList().contains(end)){ + subqueryCtx.getNewSelColList().add(end); + } + } + } + //Set the new colExprMap and new colList + operator.setColumnExprMap(subqueryCtx.getNewSelColExprMap()); + SelectDesc selDesc = (SelectDesc) operator.getConf(); + 
selDesc.setColList(subqueryCtx.getNewSelColList()); + } + + return null; + } + } + + public static NewQuerySelectSchemaProc getNewQuerySelectSchemaProc(){ + return new NewQuerySelectSchemaProc(); + } + + + /** + * We need to replace the count(literal) GenericUDAF aggregation function for group-by construct to "sum" GenericUDAF. + * This processor creates a new operator tree for a sample query that creates a GroupByOperator with sum aggregation function + * and uses that GroupByOperator information to replace the original GroupByOperator aggregation information. + * It replaces the AggregationDesc (aggregation descriptor) of the old GroupByOperator with the new Aggregation Desc + * of the new GroupByOperator. + * + * The processor also corrects the RowSchema and group-by keys by replacing the existing internal names with the new internal names. + * This change is required as we add a new subquery to the original query which triggers this change. + * + */ + private static class NewQueryGroupbySchemaProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + GroupByOperator operator = (GroupByOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + + //We need to replace the GroupByOperator which is in groupOpToInputTables map with the new GroupByOperator + if(subqueryCtx.getParseContext().getGroupOpToInputTables().containsKey(operator)){ + //we need to get rif of the alias and construct a query only with the base table name + String table = subqueryCtx.getBaseTableName(); + if(table.contains(":")){ + String[] aliasAndTab = table.split(":"); + table = aliasAndTab[1]; + } + String selReplacementCommand = ""; + if(subqueryCtx.getSelectColumnNames().iterator().hasNext()){ + //the query contains the sum aggregation GenericUDAF + selReplacementCommand = "select sum(" + subqueryCtx.getSelectColumnNames().iterator().next() + ") as TOTAL from " + table + + " group by " + subqueryCtx.getSelectColumnNames().iterator().next() + " "; + } + //create a new ParseContext for the query to retrieve its operator tree, and the required GroupByOperator from it + ParseContext newDAGContext = RewriteParseContextGenerator.generateOperatorTree(subqueryCtx.getParseContext().getConf(), + selReplacementCommand); + subqueryCtx.setNewDAGCtx(newDAGContext); + + //we get our new GroupByOperator here + Map> newGbyOpMap = subqueryCtx.getNewDAGCtx().getGroupOpToInputTables(); + GroupByOperator newGbyOperator = newGbyOpMap.keySet().iterator().next(); + + //remove the old GroupByOperator + GroupByDesc oldConf = operator.getConf(); + ArrayList oldAggrList = oldConf.getAggregators(); + if(oldAggrList != null && oldAggrList.size() > 0){ + for (AggregationDesc aggregationDesc : oldAggrList) { + if(aggregationDesc != null && aggregationDesc.getGenericUDAFName().equals("count")){ + oldAggrList.remove(aggregationDesc); + break; + } + + } + } + + //Construct the new AggregationDesc to get rid of the current internal names and replace them with new internal names + //as required by the operator tree + GroupByDesc newConf = newGbyOperator.getConf(); + ArrayList newAggrList = newConf.getAggregators(); + if(newAggrList != null && newAggrList.size() > 0){ + for (AggregationDesc aggregationDesc : newAggrList) { + subqueryCtx.setEval(aggregationDesc.getGenericUDAFEvaluator()); + ArrayList paraList = aggregationDesc.getParameters(); + for (int i=0; i< paraList.size(); i++) { + ExprNodeDesc exprNodeDesc = paraList.get(i); + if(exprNodeDesc 
instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc encd = (ExprNodeColumnDesc)exprNodeDesc; + String col = "cnt"; + if(subqueryCtx.getAliasToInternal().containsKey(col)){ + encd.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + encd.setTabAlias(null); + exprNodeDesc = encd; + } + paraList.set(i, exprNodeDesc); + } + oldAggrList.add(aggregationDesc); + } + } + + //Construct the new colExprMap to get rid of the current internal names and replace them with new internal names + //as required by the operator tree + Map newGbyColExprMap = new LinkedHashMap(); + Map oldGbyColExprMap = operator.getColumnExprMap(); + Set internalNameSet = oldGbyColExprMap.keySet(); + for (String internal : internalNameSet) { + ExprNodeDesc exprNodeDesc = oldGbyColExprMap.get(internal).clone(); + if(exprNodeDesc instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc encd = (ExprNodeColumnDesc)exprNodeDesc; + String col = encd.getColumn(); + if(subqueryCtx.getSelectColumnNames().contains(col)){ + encd.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + }else if(exprNodeDesc instanceof ExprNodeGenericFuncDesc){ + List colExprs = ((ExprNodeGenericFuncDesc)exprNodeDesc).getChildExprs(); + for (ExprNodeDesc colExpr : colExprs) { + if(colExpr instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc encd = (ExprNodeColumnDesc)colExpr; + String col = encd.getColumn(); + if(subqueryCtx.getSelectColumnNames().contains(col)){ + encd.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + } + } + + } + newGbyColExprMap.put(internal, exprNodeDesc); + } + + //Construct the new group-by keys to get rid of the current internal names and replace them with new internal names + //as required by the operator tree + ArrayList newGbyKeys = new ArrayList(); + ArrayList oldGbyKeys = oldConf.getKeys(); + for (int i =0; i< oldGbyKeys.size(); i++) { + ExprNodeDesc exprNodeDesc = oldGbyKeys.get(i).clone(); + if(exprNodeDesc instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc encd = (ExprNodeColumnDesc)exprNodeDesc; + String col = encd.getColumn(); + if(subqueryCtx.getSelectColumnNames().contains(col)){ + encd.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + exprNodeDesc = encd; + }else if(exprNodeDesc instanceof ExprNodeGenericFuncDesc){ + ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc)exprNodeDesc; + List colExprs = engfd.getChildExprs(); + for (ExprNodeDesc colExpr : colExprs) { + if(colExpr instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc encd = (ExprNodeColumnDesc)colExpr; + String col = encd.getColumn(); + if(subqueryCtx.getSelectColumnNames().contains(col)){ + encd.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + + } + } + } + newGbyKeys.add(exprNodeDesc); + } + + //Construct the new RowSchema. 
We do not need a alias for the new internalNames + RowSchema oldRS = operator.getSchema(); + ArrayList oldSign = oldRS.getSignature(); + ArrayList newSign = new ArrayList(); + for (ColumnInfo columnInfo : oldSign) { + columnInfo.setAlias(null); + newSign.add(columnInfo); + } + + //reset the above data structures in the original GroupByOperator + oldRS.setSignature(newSign); + operator.setSchema(oldRS); + oldConf.setKeys(newGbyKeys); + oldConf.setAggregators(oldAggrList); + operator.setColumnExprMap(newGbyColExprMap); + operator.setConf(oldConf); + + }else{ + //we just need to reset the GenericUDAFEvaluator and its name for this GroupByOperator whose parent is the + //ReduceSinkOperator + GroupByDesc childConf = (GroupByDesc) operator.getConf(); + ArrayList childAggrList = childConf.getAggregators(); + if(childAggrList != null && childAggrList.size() > 0){ + for (AggregationDesc aggregationDesc : childAggrList) { + aggregationDesc.setGenericUDAFEvaluator(subqueryCtx.getEval()); + aggregationDesc.setGenericUDAFName("sum"); + } + } + + } + + return null; + } + } + + public static NewQueryGroupbySchemaProc getNewQueryGroupbySchemaProc(){ + return new NewQueryGroupbySchemaProc(); + } + + + /** + * This processor corrects the RowResolver for the FilterOperator of the original operator tree using + * the RowResolver obtained from the subquery SelectOperator in SubquerySelectSchemaProc processor. + * It also needs to replace the current internal names with new internal names for all instances of the + * ExprNodeColumnDesc. It recursively calls the setFilterPredicateCol(..) method to set this information correctly. + * + */ + private static class NewQueryFilterSchemaProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + FilterOperator operator = (FilterOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + //Set new RowResolver + operator.getSchema().setSignature(subqueryCtx.getNewRS()); + subqueryCtx.getParseContext().getOpParseCtx().get(operator).setRowResolver(subqueryCtx.getNewRR()); + + //Set correct internalNames + FilterDesc conf = operator.getConf(); + ExprNodeDesc exprNodeDesc = conf.getPredicate(); + setFilterPredicateCol(exprNodeDesc); + conf.setPredicate(exprNodeDesc); + return null; + } + } + + + /** + * This method is recursively called whenever we have our expression node descriptor to be an instance of the ExprNodeGenericFuncDesc. 
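+   * For example, a predicate such as (year(l_shipdate) = 1997) from the test queries in
+   * ql_rewrite_gbtoidx.q reaches this method as an ExprNodeGenericFuncDesc, and the recursion
+   * descends into its children.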
+ * We exit the recursion when we find an instance of ExprNodeColumnDesc and set its column name to internal name + * @param exprNodeDesc + */ + private static void setFilterPredicateCol(ExprNodeDesc exprNodeDesc){ + if(exprNodeDesc instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc encd = (ExprNodeColumnDesc)exprNodeDesc; + String col = encd.getColumn(); + if(subqueryCtx.getSelectColumnNames().contains(col)){ + encd.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + exprNodeDesc = encd; + }else if(exprNodeDesc instanceof ExprNodeGenericFuncDesc){ + ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc)exprNodeDesc; + List colExprs = engfd.getChildExprs(); + for (ExprNodeDesc colExpr : colExprs) { + //continue until you find an instance of the ExprNodeColumnDesc + setFilterPredicateCol(colExpr); + } + } + + } + + + public static NewQueryFilterSchemaProc getNewQueryFilterSchemaProc(){ + return new NewQueryFilterSchemaProc(); + } + + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteParseContextGenerator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteParseContextGenerator.java new file mode 100644 index 0000000..3d97a4a --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteParseContextGenerator.java @@ -0,0 +1,100 @@ +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.ParseDriver; +import org.apache.hadoop.hive.ql.parse.ParseException; +import org.apache.hadoop.hive.ql.parse.ParseUtils; +import org.apache.hadoop.hive.ql.parse.QB; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzerFactory; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/** + * RewriteParseContextGenerator is a class that offers methods to generate operator tree + * for input queries. It is implemented on lines of the analyzeInternal(..) method + * of {@link SemanticAnalyzer} but it creates only the ParseContext for the input query command. + * It does not optimize or generate map-reduce tasks for the input query. + * This can be used when you need to create operator tree for an internal query. + * For example, {@link RewriteGBUsingIndex} uses the {@link RewriteIndexSubqueryProcFactory} methods to + * generate subquery that scans over index table rather than original table. 
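+ * A minimal usage sketch (the query string and index table name are illustrative):
+ * <pre>
+ *   HiveConf conf = new HiveConf();
+ *   ParseContext subPCtx = RewriteParseContextGenerator.generateOperatorTree(conf,
+ *       "select l_shipdate, size(`_offsets`) as cnt from default__lineitem_lineitem_lshipdate_idx__");
+ *   // subPCtx.getTopOps() now holds the TableScanOperator that roots the
+ *   // unoptimized operator tree generated for this command.
+ * </pre>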
+ * + */ +public final class RewriteParseContextGenerator { + protected static Log LOG = LogFactory.getLog(RewriteParseContextGenerator.class.getName()); + + /** + * Parse the input {@link String} command and generate a ASTNode tree + * @param conf + * @param command + * @return + */ + public static ParseContext generateOperatorTree(HiveConf conf, String command){ + Context ctx; + ParseContext subPCtx = null; + try { + ctx = new Context(conf); + ParseDriver pd = new ParseDriver(); + ASTNode tree = pd.parse(command, ctx); + tree = ParseUtils.findRootNonNullToken(tree); + + BaseSemanticAnalyzer sem = SemanticAnalyzerFactory.get(conf, tree); + doSemanticAnalysis(sem, tree, ctx); + + subPCtx = ((SemanticAnalyzer) sem).getParseContext(); + LOG.info("Sub-query Semantic Analysis Completed"); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (ParseException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (SemanticException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return subPCtx; + + } + + /** + * For the input ASTNode tree, perform a semantic analysis and check metadata + * Generate a operator tree and return the {@link ParseContext} instance for the operator tree + * + * @param ctx + * @param sem + * @param ast + * @return + * @throws SemanticException + */ + private static void doSemanticAnalysis(BaseSemanticAnalyzer sem, ASTNode ast, Context ctx) throws SemanticException { + + if(sem instanceof SemanticAnalyzer){ + QB qb = new QB(null, null, false); + ASTNode child = ast; + ParseContext subPCtx = ((SemanticAnalyzer) sem).getParseContext(); + subPCtx.setContext(ctx); + ((SemanticAnalyzer) sem).init(subPCtx); + + LOG.info("Starting Sub-query Semantic Analysis"); + ((SemanticAnalyzer) sem).doPhase1(child, qb, ((SemanticAnalyzer) sem).initPhase1Ctx()); + LOG.info("Completed phase 1 of Sub-query Semantic Analysis"); + + ((SemanticAnalyzer) sem).getMetaData(qb); + LOG.info("Completed getting MetaData in Sub-query Semantic Analysis"); + + LOG.info("Sub-query Abstract syntax tree: " + ast.toStringTree()); + ((SemanticAnalyzer) sem).genPlan(qb); + + LOG.info("Sub-query Completed plan generation"); + } + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyCtx.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyCtx.java new file mode 100644 index 0000000..8662a41 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyCtx.java @@ -0,0 +1,222 @@ +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Stack; + +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.PreOrderWalker; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.HiveParser; +import 
org.apache.hadoop.hive.ql.parse.OpParseContext; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/** + * RewriteRemoveGroupbyCtx class stores the context for the {@link RewriteRemoveGroupbyProcFactory} processor factory methods + */ +public class RewriteRemoveGroupbyCtx implements NodeProcessorCtx { + + private RewriteRemoveGroupbyCtx(ParseContext parseContext, Hive hiveDb, String indexTableName){ + //this prevents the class from getting instantiated + this.parseContext = parseContext; + this.hiveDb = hiveDb; + this.indexName = indexTableName; + this.opc = parseContext.getOpParseCtx(); + } + + public static RewriteRemoveGroupbyCtx getInstance(ParseContext parseContext, Hive hiveDb, String indexTableName){ + return new RewriteRemoveGroupbyCtx(parseContext, hiveDb, indexTableName); + } + + //We need these two ArrayLists to reset the parent operator list and child operator list in the operator tree + // once we remove the operators that represent the group-by construct + private final List> newParentList = new ArrayList>(); + private final List> newChildrenList = new ArrayList>(); + + //We need to remove the operators from OpParseContext to remove them from the operator tree + private LinkedHashMap, OpParseContext> opc = new LinkedHashMap, OpParseContext>(); + private final Hive hiveDb; + private final ParseContext parseContext; + + //We need the RewriteCanApplyCtx instance to retrieve the mapping from original table to index table in the + // getReplaceTableScanProc() method of the RewriteRemoveGroupbyProcFactory + //private RewriteCanApplyCtx canApplyCtx; + private final String indexName; + + public List> getNewParentList() { + return newParentList; + } + + public List> getNewChildrenList() { + return newChildrenList; + } + + public LinkedHashMap, OpParseContext> getOpc() { + return opc; + } + + public ParseContext getParseContext() { + return parseContext; + } + + public Hive getHiveDb() { + return hiveDb; + } + + public String getIndexName() { + return indexName; + } + + /** + * Given a root node of the parse tree, this function returns the "first" TOK_FUNCTION node + * that matches the input function name + * + * @param root + * @return + */ + ASTNode getFuncNode(ASTNode root, String funcName){ + ASTNode func = null; + ArrayList cList = root.getChildren(); + while(cList != null && cList.size() > 0){ + for (Node node : cList) { + if(null != node){ + ASTNode curr = (ASTNode)node; + if(curr.getType() == HiveParser.TOK_FUNCTION){ + ArrayList funcChildren = curr.getChildren(); + for (Node child : funcChildren) { + ASTNode funcChild = (ASTNode)child; + if(funcChild.getText().equals(funcName)){ + func = curr; + cList = null; + break; + } + } + }else{ + cList = curr.getChildren(); + continue; + } + } + } + } + return func; + } + + + /** + * Given an input operator, this function returns the top TableScanOperator for the operator tree + * @param inputOp + * @return + */ + Operator getTopOperator(Operator inputOp){ + Operator tsOp = null; + List> parentList = inputOp.getParentOperators(); + while(parentList != null && parentList.size() > 0){ + for (Operator op : parentList) { + if(op != null){ + if(op instanceof TableScanOperator){ + tsOp = (TableScanOperator) op; + parentList = null; + break; + }else{ + parentList = op.getParentOperators(); + continue; + } + } + } + } + + return tsOp; + } + + + /** + * Walk the original operator tree using the {@link PreOrderWalker} using the rules. 
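+   * For the first test query in ql_rewrite_gbtoidx.q,
+   * <pre>
+   *   select l_shipdate, count(1) from lineitem group by l_shipdate;
+   * </pre>
+   * this walk rewrites the plan into one roughly equivalent to
+   * <pre>
+   *   select l_shipdate, size(`_offsets`) from default__lineitem_lineitem_lshipdate_idx__;
+   * </pre>
+   * (index table name shown for illustration).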
+ * Each of the rules invoke respective methods from the {@link RewriteRemoveGroupbyProcFactory} + * to remove the group-by constructs from the original query and replace the original + * {@link TableScanOperator} with the new index table scan operator. + * + * @param topOp + * @throws SemanticException + */ + public void invokeRemoveGbyProc(Operator topOp) throws SemanticException{ + Map opRules = new LinkedHashMap(); + + // replace scan operator containing original table with index table + opRules.put(new RuleRegExp("R1", "TS%"), RewriteRemoveGroupbyProcFactory.getReplaceTableScanProc()); + //rule that replaces index key selection with size(_offsets) function in original query + opRules.put(new RuleRegExp("R2", "SEL%"), RewriteRemoveGroupbyProcFactory.getReplaceIdxKeyWithSizeFuncProc()); + // remove group-by pattern from original operator tree + opRules.put(new RuleRegExp("R3", "GBY%RS%GBY%"), RewriteRemoveGroupbyProcFactory.getRemoveGroupByProc()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this); + GraphWalker ogw = new PreOrderWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.add(topOp); + ogw.startWalking(topNodes, null); + + } + + + /** + * Walk the original operator tree using the {@link PreOrderWalker} using the rules. + * Each of the rules invoke respective methods from the {@link RewriteRemoveGroupbyProcFactory} + * to replace the original {@link TableScanOperator} with the new index table scan operator. + * + * @param topOp + * @throws SemanticException + */ + public void invokeReplaceTableScanProc(Operator topOp) throws SemanticException{ + Map opRules = new LinkedHashMap(); + + // replace scan operator containing original table with index table + opRules.put(new RuleRegExp("R1", "TS%"), RewriteRemoveGroupbyProcFactory.getReplaceTableScanProc()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this); + GraphWalker ogw = new PreOrderWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.add(topOp); + ogw.startWalking(topNodes, null); + + } + + /** + * Default procedure for {@link DefaultRuleDispatcher} + * @return + */ + private NodeProcessor getDefaultProc() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack stack, + NodeProcessorCtx procCtx, Object... 
nodeOutputs) throws SemanticException { + return null; + } + }; + } + + + + +} + diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyProcFactory.java new file mode 100644 index 0000000..1a04855 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyProcFactory.java @@ -0,0 +1,339 @@ +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.OpParseContext; +import org.apache.hadoop.hive.ql.parse.ParseDriver; +import org.apache.hadoop.hive.ql.parse.ParseException; +import org.apache.hadoop.hive.ql.parse.ParseUtils; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzerFactory; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; + +/** + * Factory of processors used by {@link RewriteGBUsingIndex} (see invokeRemoveGbyProc(..) method) + * Each of the processors are invoked according to a rule and serve towards removing + * group-by construct from original operator tree + * + */ +public final class RewriteRemoveGroupbyProcFactory { + protected final static Log LOG = LogFactory.getLog(RewriteRemoveGroupbyProcFactory.class.getName()); + private static RewriteRemoveGroupbyCtx removeGbyCtx = null; + + private RewriteRemoveGroupbyProcFactory() { + //this prevents the class from getting instantiated + } + + /** + * This processor removes the SelectOperator whose child is a GroupByOperator from the operator tree (OpParseContext). 
+ * When we remove the group-by construct from the query, we do not need this SelectOperator which worked initially as an + * interim operator to pass arguments from the parent TableScanOperator to the child GroupByOperator (Remember that the genPlan(..) + * method creates the operators bottom-up FROM-WHERE-GROUPBY-ORDER-BY-SELECT etc) + * + * Since we need to remove the group-by construct (comprising of GBY-RS-GBY operators and interim SEL operator), the processor sets the + * appropriate parent-child links. + * + * The processor also constructs a ExprNodeDesc instance for the size(_offsets) function and replaces the index key columns + * with this function descriptor. It also sets the rowSchema, colList and colExprMap data structures correctly for this SelectOperator + * to accommodate the new replacement and removal of group-by construct + * + */ + private static class ReplaceIdxKeyWithSizeFunc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + SelectOperator operator = (SelectOperator)nd; + removeGbyCtx = (RewriteRemoveGroupbyCtx)ctx; + + //as of now, we have hard-coded the positions as get(0) etc as whenever a group-by construct appears in teh operator tree, + //it comes in the SEL-GBY-RS-SEL combination. This lets us presume that the parent or child operator will always be + // at the 0th position in the DAG operator tree + List> childrenList = operator.getChildOperators(); + Operator child = childrenList.get(0); + Operator parent = operator.getParentOperators().get(0); + + if(child instanceof GroupByOperator){ + //this is the interim SEL operator for the group-by construct, we do not need this in the re-written operator tree + removeGbyCtx.getNewParentList().addAll(operator.getParentOperators()); + removeGbyCtx.getOpc().remove(operator); + }else if(parent instanceof GroupByOperator){ + + // set the child operator list of interim SEL's parent operator to be the child operator list of the GroupByOperator + removeGbyCtx.getNewParentList().get(0).setChildOperators(removeGbyCtx.getNewChildrenList()); + // set the parent operator list for the SelectOperator (whose parent operator is GroupByOperator) + //to be the parent list of interim SEL operator + removeGbyCtx.getNewChildrenList().get(0).setParentOperators(removeGbyCtx.getNewParentList()); + + //This code parses the string command and constructs a ASTNode parse tree + //we need this to construct the ExprNodeDesc for the size(_offsets) function + HiveConf conf = removeGbyCtx.getParseContext().getConf(); + Context context = null; + ASTNode tree = null; + BaseSemanticAnalyzer sem = null; + String newSelCommand = "select size(`_offsets`) from " + removeGbyCtx.getIndexName(); + try { + context = new Context(conf); + ParseDriver pd = new ParseDriver(); + tree = pd.parse(newSelCommand, context); + tree = ParseUtils.findRootNonNullToken(tree); + sem = SemanticAnalyzerFactory.get(conf, tree); + + } catch (ParseException e) { + LOG.info("ParseException in ReplaceIdxKeyWithSizeFunc"); + e.printStackTrace(); + } catch (SemanticException e) { + LOG.info("SemanticException in ReplaceIdxKeyWithSizeFunc"); + e.printStackTrace(); + } catch (IOException e) { + LOG.info("IOException in ReplaceIdxKeyWithSizeFunc"); + e.printStackTrace(); + } + + //We retrieve the ASTNode function token from the root tree + ASTNode funcNode = removeGbyCtx.getFuncNode(tree, "size"); + + //We need the rowResolver of the parent TableScanOperator to fix the rowSchema, colList, 
colExprMap of the SelectOperator + //and also to construct the ExprNodeDesc to replace the index key columns with size(_offsets) GenericUDF + LinkedHashMap, OpParseContext> opCtxMap = + removeGbyCtx.getParseContext().getOpParseCtx(); + Operator tsOp = removeGbyCtx.getTopOperator(operator); + OpParseContext tsCtx = opCtxMap.get(tsOp); + ExprNodeDesc exprNode = ((SemanticAnalyzer) sem).genExprNodeDesc(funcNode, tsCtx.getRowResolver()); + + //We need the name of the GenericUDF function to correct the rowSchema + String funcName = ""; + + if(exprNode instanceof ExprNodeGenericFuncDesc){ + List exprList = ((ExprNodeGenericFuncDesc) exprNode).getChildExprs(); + for (ExprNodeDesc exprNodeDesc : exprList) { + if(exprNodeDesc instanceof ExprNodeColumnDesc){ + funcName = ((ExprNodeColumnDesc) exprNodeDesc).getColumn(); + } + } + } + + SelectDesc selDesc = (SelectDesc) operator.getConf(); + //Since we have removed the interim SEL operator when we removed the group-by construct, we need to get rid + //of the internal names in the colList and colExprMap of this SelectOperator + //internalToAlias map gives us this mapping to correct these data structures + HashMap internalToAlias = new LinkedHashMap(); + + //Set the new RowSchema and populate the internalToAlias map + RowSchema rs = operator.getSchema(); + ArrayList newRS = new ArrayList(); + ArrayList sign = rs.getSignature(); + for (ColumnInfo columnInfo : sign) { + String alias = columnInfo.getAlias(); + String internalName = columnInfo.getInternalName(); + internalToAlias.put(internalName, alias); + //the function name always has alias starting with _c (for eg. _c1 etc) + //We need to set the new alias (_offsets) for the initial "_c1" in rowSchema + if(alias != null && alias.startsWith("_c")){ + columnInfo.setAlias(funcName); + } + newRS.add(columnInfo); + } + operator.getSchema().setSignature(newRS); + + //Set the colList of this SelectOperator + ArrayList colList = selDesc.getColList(); + int i = 0; + for (; i< colList.size(); i++) { + ExprNodeDesc exprNodeDesc = colList.get(i); + if(exprNodeDesc instanceof ExprNodeColumnDesc){ + String internal = ((ExprNodeColumnDesc)exprNodeDesc).getColumn(); + //get rid of the internal column names like _col0, _col1 and replace them with their actual names i.e. alias + if(internalToAlias.get(internal) != null){ + ((ExprNodeColumnDesc) exprNodeDesc).setColumn(internalToAlias.get(internal)); + } + //however, if the alias itself is the internal name of the function argument, say _c1, we need to replace the + //ExprNodeColumnDesc instance with the ExprNodeGenericFuncDesc (i.e. exprNode here) + //this replaces the count(literal) or count(index_key) function with size(_offsets) + if(((ExprNodeColumnDesc) exprNodeDesc).getColumn().startsWith("_c")){ + colList.set(i, exprNode); + } + } + } + + selDesc.setColList(colList); + + //Set the new colExprMap for this SelectOperator + Map origColExprMap = operator.getColumnExprMap(); + Map newColExprMap = new LinkedHashMap(); + Set internalNamesList = origColExprMap.keySet(); + for (String internal : internalNamesList) { + ExprNodeDesc end = origColExprMap.get(internal).clone(); + if(end instanceof ExprNodeColumnDesc){ + //get rid of the internal column names like _col0, _col1 and replace them with their actual names i.e. 
alias + if(internalToAlias.get(internal) != null){ + ((ExprNodeColumnDesc) end).setColumn(internalToAlias.get(internal)); + } + //this replaces the count(literal) or count(index_key) function with size(_offsets) + if(((ExprNodeColumnDesc) end).getColumn().startsWith("_c")){ + newColExprMap.put(internal, exprNode); + }else{ + newColExprMap.put(internal, end); + } + }else{ + newColExprMap.put(internal, end); + } + } + operator.setColumnExprMap(newColExprMap); + } + return null; + } + } + + public static ReplaceIdxKeyWithSizeFunc getReplaceIdxKeyWithSizeFuncProc(){ + return new ReplaceIdxKeyWithSizeFunc(); + } + + + /** + * This processor replaces the original TableScanOperator with the new TableScanOperator and metadata that scans over the + * index table rather than scanning over the orginal table. + * + */ + private static class RepaceTableScanOpProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + TableScanOperator scanOperator = (TableScanOperator)nd; + removeGbyCtx = (RewriteRemoveGroupbyCtx)ctx; + + HashMap topToTable = + removeGbyCtx.getParseContext().getTopToTable(); + + //Check if we have a valid index on the original base table for the replacement +/* String baseTableName = topToTable.get(scanOperator).getTableName(); + if( removeGbyCtx.getCanApplyCtx().findBaseTable(baseTableName) == null ) { + LOG.debug("No mapping found for original table and index table name"); + } +*/ + //construct a new descriptor for the index table scan + TableScanDesc indexTableScanDesc = new TableScanDesc(); + indexTableScanDesc.setGatherStats(false); + + //String tableName = removeGbyCtx.getCanApplyCtx().findBaseTable(baseTableName); + String tableName = removeGbyCtx.getIndexName(); + + tableSpec ts = new tableSpec(removeGbyCtx.getHiveDb(), + removeGbyCtx.getParseContext().getConf(), + tableName + ); + String k = tableName + Path.SEPARATOR; + indexTableScanDesc.setStatsAggPrefix(k); + scanOperator.setConf(indexTableScanDesc); + + //remove original TableScanOperator + topToTable.clear(); + removeGbyCtx.getParseContext().getTopOps().clear(); + + //Scan operator now points to other table + scanOperator.setAlias(tableName); + topToTable.put(scanOperator, ts.tableHandle); + removeGbyCtx.getParseContext().setTopToTable(topToTable); + + OpParseContext operatorContext = + removeGbyCtx.getParseContext().getOpParseCtx().get(scanOperator); + RowResolver rr = new RowResolver(); + removeGbyCtx.getParseContext().getOpParseCtx().remove(scanOperator); + + + //Construct the new RowResolver for the new TableScanOperator + try { + StructObjectInspector rowObjectInspector = (StructObjectInspector) ts.tableHandle.getDeserializer().getObjectInspector(); + List fields = rowObjectInspector + .getAllStructFieldRefs(); + for (int i = 0; i < fields.size(); i++) { + rr.put(tableName, fields.get(i).getFieldName(), new ColumnInfo(fields + .get(i).getFieldName(), TypeInfoUtils + .getTypeInfoFromObjectInspector(fields.get(i) + .getFieldObjectInspector()), tableName, false)); + } + } catch (SerDeException e) { + throw new RuntimeException(e); + } + //Set row resolver for new table + operatorContext.setRowResolver(rr); + + //Put the new TableScanOperator in the OpParseContext and topOps maps of the original ParseContext + removeGbyCtx.getParseContext().getOpParseCtx().put(scanOperator, operatorContext); + removeGbyCtx.getParseContext().getTopOps().put(tableName, scanOperator); + return null; + } + } + + public static RepaceTableScanOpProc 
getReplaceTableScanProc(){ + return new RepaceTableScanOpProc(); + } + + /** + * This processor removes the GroupBy operators and the interim ReduceSinkOperator from the OpParseContext + * + */ + private static class RemoveGBYProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + GroupByOperator operator = (GroupByOperator)nd; + removeGbyCtx = (RewriteRemoveGroupbyCtx)ctx; + //On walking the operator tree using the rule 'GBY-RS-GBY', we get the GroupByOperator that is not in the 'groupOpToInputTables' + //map in the ParseContext. Hence the check. + if(!removeGbyCtx.getParseContext().getGroupOpToInputTables().containsKey(operator)){ + removeGbyCtx.getNewChildrenList().addAll(operator.getChildOperators()); + + ReduceSinkOperator rsOp = (ReduceSinkOperator) operator.getParentOperators().get(0); + removeGbyCtx.getOpc().remove(rsOp); + + GroupByOperator gbyOp = (GroupByOperator) rsOp.getParentOperators().get(0); + //we need to remove this GBY operator from the groupOpToInputTables map from ParseContext as well + removeGbyCtx.getParseContext().getGroupOpToInputTables().remove(gbyOp); + removeGbyCtx.getOpc().remove(gbyOp); + + } + + return null; + } + } + + public static RemoveGBYProc getRemoveGroupByProc(){ + return new RemoveGBYProc(); + } + + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java index d8442b2..fd012a5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java @@ -71,7 +71,7 @@ public abstract class BaseSemanticAnalyzer { protected Context ctx; protected HashMap idToTableNameMap; - + public static int HIVE_COLUMN_ORDER_ASC = 1; public static int HIVE_COLUMN_ORDER_DESC = 0; @@ -583,6 +583,22 @@ public abstract class BaseSemanticAnalyzer { public static enum SpecType {TABLE_ONLY, STATIC_PARTITION, DYNAMIC_PARTITION}; public SpecType specType; + public tableSpec(Hive db, HiveConf conf, String tableName) throws SemanticException { + this.tableName = tableName; + + try { + this.tableHandle = db.getTable(tableName); + } catch (HiveException e) { + //XTODO: Throw semantic exception here + throw new SemanticException(ErrorMsg.GENERIC_ERROR.getMsg(tableName), e); + } + this.specType = SpecType.TABLE_ONLY; + + } + private Table getTable(String tableName2) { + // TODO Auto-generated method stub + return null; + } public tableSpec(Hive db, HiveConf conf, ASTNode ast) throws SemanticException { @@ -719,7 +735,7 @@ public abstract class BaseSemanticAnalyzer { } return partSpec; } - + public Hive getDb() { return db; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 15e7a13..cc0ee20 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -27,9 +27,9 @@ import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Map.Entry; import java.util.Set; import java.util.TreeSet; +import java.util.Map.Entry; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -91,7 +91,6 @@ import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.optimizer.GenMRFileSink1; import 
org.apache.hadoop.hive.ql.optimizer.GenMROperator; import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext; -import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx; import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink1; import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink2; import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink3; @@ -101,6 +100,7 @@ import org.apache.hadoop.hive.ql.optimizer.GenMRUnion1; import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils; import org.apache.hadoop.hive.ql.optimizer.MapJoinFactory; import org.apache.hadoop.hive.ql.optimizer.Optimizer; +import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx; import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalContext; import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalOptimizer; import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; @@ -121,7 +121,6 @@ import org.apache.hadoop.hive.ql.plan.ExtractDesc; import org.apache.hadoop.hive.ql.plan.FetchWork; import org.apache.hadoop.hive.ql.plan.FileSinkDesc; import org.apache.hadoop.hive.ql.plan.FilterDesc; -import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc; import org.apache.hadoop.hive.ql.plan.ForwardDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.HiveOperation; @@ -144,12 +143,13 @@ import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.ql.plan.UDTFDesc; import org.apache.hadoop.hive.ql.plan.UnionDesc; +import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionState.ResourceType; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash; import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode; import org.apache.hadoop.hive.serde.Constants; import org.apache.hadoop.hive.serde2.Deserializer; import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe; @@ -157,9 +157,9 @@ import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.serde2.SerDeUtils; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; @@ -7391,4 +7391,4 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer { return conf.getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS); } -} +} \ No newline at end of file diff --git a/ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java b/ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java index 6a5eec3..a4dc28d 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java @@ -330,7 +330,17 @@ public class QTestUtil { db.setCurrentDatabase(dbName); for (String tblName : db.getAllTables()) { if 
(!DEFAULT_DATABASE_NAME.equals(dbName) || !srcTables.contains(tblName)) { - db.dropTable(dbName, tblName); + Table table = db.getTable(dbName, tblName, false); + if (MetaStoreUtils.isIndexTable(table.getTTable())) { + // Skip the index type table here. + // XTODO: Assuming (but verify) + // - Drop table automatically drops indexes on that table too. + // - No other case results into dangling indexes i.e. where indexes are + // left behind but orig (base) table no longer exists. + } + else { + db.dropTable(dbName, tblName); + } } } if (!DEFAULT_DATABASE_NAME.equals(dbName)) { diff --git a/ql/src/test/queries/clientnegative/fatal.q b/ql/src/test/queries/clientnegative/fatal.q new file mode 100644 index 0000000..367e0fc --- /dev/null +++ b/ql/src/test/queries/clientnegative/fatal.q @@ -0,0 +1,4 @@ +set hive.mapjoin.maxsize=1; +set hive.task.progress=true; + +select /*+ mapjoin(b) */ * from src a join src b on (a.key=b.key); diff --git a/ql/src/test/queries/clientpositive/ql_rewrite_gbtoidx.q b/ql/src/test/queries/clientpositive/ql_rewrite_gbtoidx.q new file mode 100644 index 0000000..cccd1ec --- /dev/null +++ b/ql/src/test/queries/clientpositive/ql_rewrite_gbtoidx.q @@ -0,0 +1,162 @@ + +DROP TABLE lineitem; +CREATE TABLE lineitem (L_ORDERKEY INT, + L_PARTKEY INT, + L_SUPPKEY INT, + L_LINENUMBER INT, + L_QUANTITY DOUBLE, + L_EXTENDEDPRICE DOUBLE, + L_DISCOUNT DOUBLE, + L_TAX DOUBLE, + L_RETURNFLAG STRING, + L_LINESTATUS STRING, + l_shipdate STRING, + L_COMMITDATE STRING, + L_RECEIPTDATE STRING, + L_SHIPINSTRUCT STRING, + L_SHIPMODE STRING, + L_COMMENT STRING) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|'; + +CREATE INDEX lineitem_lshipdate_idx ON TABLE lineitem(l_shipdate) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD; +ALTER INDEX lineitem_lshipdate_idx ON lineitem REBUILD; + +set hive.optimize.gbyusingindex=true; + +explain select l_shipdate, + count(1) +from +lineitem +group by l_shipdate; + + +explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month; + + + +explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by year(l_shipdate), month(l_shipdate); + + + +explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, size(`_offsets`) as sz +from default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate); + + +explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by year(l_shipdate), month(l_shipdate); + + + + + + + + + + + +explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, size(`_offsets`) as sz +from default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate); + + +explain select year(L_SHIPDATE), month(L_SHIPDATE) as month_bkt, COUNT(1) + from lineitem +group by 
year(L_SHIPDATE), month(L_SHIPDATE); + + +explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month + and lastyear.year = thisyear.year; + + +DROP TABLE tbl; +CREATE TABLE tbl(key int, value int); +CREATE INDEX tbl_key_idx ON TABLE tbl(key) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD; +ALTER INDEX tbl_key_idx ON tbl REBUILD; +set hive.optimize.gbyusingindex=true; +EXPLAIN select key, count(key) from tbl where key = 1 group by key; +EXPLAIN SELECT DISTINCT key FROM tbl; +EXPLAIN select count(1) from tbl; +EXPLAIN select key, count(key) from tbl group by key; +EXPLAIN select count(key) from tbl; +EXPLAIN SELECT DISTINCT key FROM tbl; +EXPLAIN SELECT key FROM tbl GROUP BY key; + +EXPLAIN SELECT DISTINCT key FROM tbl; +EXPLAIN SELECT DISTINCT key, value FROM tbl; + +EXPLAIN SELECT key FROM tbl GROUP BY key; +EXPLAIN SELECT key FROM tbl GROUP BY value, key; +EXPLAIN SELECT key, value FROM tbl GROUP BY value, key; + +EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2; +EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 AND key = 3; +EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = key; + + +EXPLAIN SELECT key FROM tbl WHERE key = 3 GROUP BY key; +EXPLAIN SELECT key, value FROM tbl WHERE value = 1 GROUP BY key, value; + +EXPLAIN SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2; + +EXPLAIN SELECT key FROM tbl WHERE value = 2 GROUP BY key; +EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl WHERE value = key; +EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl; +EXPLAIN SELECT key FROM tbl GROUP BY key, substr(key,2,3); + +DROP TABLE tbl; diff --git a/ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out b/ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out new file mode 100644 index 0000000..0efd042 --- /dev/null +++ b/ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out @@ -0,0 +1,2743 @@ +PREHOOK: query: DROP TABLE lineitem +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE lineitem +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE lineitem (L_ORDERKEY INT, + L_PARTKEY INT, + L_SUPPKEY INT, + L_LINENUMBER INT, + L_QUANTITY DOUBLE, + L_EXTENDEDPRICE DOUBLE, + L_DISCOUNT DOUBLE, + L_TAX DOUBLE, + L_RETURNFLAG STRING, + L_LINESTATUS STRING, + l_shipdate STRING, + L_COMMITDATE STRING, + L_RECEIPTDATE STRING, + L_SHIPINSTRUCT STRING, + L_SHIPMODE STRING, + L_COMMENT STRING) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE lineitem (L_ORDERKEY INT, + L_PARTKEY INT, + L_SUPPKEY INT, + L_LINENUMBER INT, + L_QUANTITY DOUBLE, + L_EXTENDEDPRICE DOUBLE, + L_DISCOUNT DOUBLE, + L_TAX DOUBLE, + L_RETURNFLAG STRING, + L_LINESTATUS STRING, + l_shipdate STRING, + L_COMMITDATE STRING, + L_RECEIPTDATE STRING, + L_SHIPINSTRUCT STRING, + L_SHIPMODE STRING, + L_COMMENT STRING) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: 
default@lineitem +PREHOOK: query: CREATE INDEX lineitem_lshipdate_idx ON TABLE lineitem(l_shipdate) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD +PREHOOK: type: CREATEINDEX +POSTHOOK: query: CREATE INDEX lineitem_lshipdate_idx ON TABLE lineitem(l_shipdate) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD +POSTHOOK: type: CREATEINDEX +PREHOOK: query: ALTER INDEX lineitem_lshipdate_idx ON lineitem REBUILD +PREHOOK: type: QUERY +PREHOOK: Input: default@lineitem +PREHOOK: Output: default@default__lineitem_lineitem_lshipdate_idx__ +POSTHOOK: query: ALTER INDEX lineitem_lshipdate_idx ON lineitem REBUILD +POSTHOOK: type: QUERY +POSTHOOK: Input: default@lineitem +POSTHOOK: Output: default@default__lineitem_lineitem_lshipdate_idx__ +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +PREHOOK: query: explain select l_shipdate, + count(1) +from +lineitem +group by l_shipdate +PREHOOK: type: QUERY +POSTHOOK: query: explain select l_shipdate, + count(1) +from +lineitem +group by l_shipdate +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL l_shipdate)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL l_shipdate)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__lineitem_lineitem_lshipdate_idx__ + TableScan + Select Operator + expressions: + expr: l_shipdate + type: string + expr: size(_offsets) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month +PREHOOK: 
type: QUERY +POSTHOOK: query: explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1997)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) lastyear) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1998)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) thisyear) (= (. (TOK_TABLE_OR_COL lastyear) month) (. (TOK_TABLE_OR_COL thisyear) month)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL lastyear) month)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL thisyear) month)) (TOK_SELEXPR (/ (- (. (TOK_TABLE_OR_COL thisyear) monthly_shipments) (. (TOK_TABLE_OR_COL lastyear) monthly_shipments)) (. 
(TOK_TABLE_OR_COL lastyear) monthly_shipments)) monthly_shipments_delta)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-3 + Stage-3 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + lastyear:default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Filter Operator + predicate: + expr: (year(l_shipdate) = 1997) + type: boolean + Select Operator + expressions: + expr: l_shipdate + type: string + expr: size(_offsets) + type: int + outputColumnNames: _col0, _col1 + Filter Operator + predicate: + expr: (year(_col0) = 1997) + type: boolean + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col1 + type: int + sort order: + + Map-reduce partition columns: + expr: _col1 + type: int + tag: 0 + value expressions: + expr: _col1 + type: int + expr: _col2 + type: bigint + $INTNAME1 + Reduce Output Operator + key expressions: + expr: _col1 + type: int + sort order: + + Map-reduce partition columns: + expr: _col1 + type: int + tag: 1 + value expressions: + expr: _col1 + type: int + expr: _col2 + type: bigint + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col1} {VALUE._col2} + 1 {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col1, _col2, _col4, _col5 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col4 + type: int + expr: ((_col5 - _col2) / _col2) + type: double + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + thisyear:default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Filter Operator + predicate: + expr: (year(l_shipdate) = 1998) + type: boolean + Select Operator + expressions: + expr: l_shipdate + type: string + expr: size(_offsets) + type: int + outputColumnNames: _col0, _col1 + Filter Operator + predicate: + expr: (year(_col0) = 1998) + type: boolean + Select Operator + expressions: + expr: _col1 + type: int + 
expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by year(l_shipdate), month(l_shipdate) +PREHOOK: type: QUERY +POSTHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by year(l_shipdate), month(l_shipdate) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Select Operator + expressions: + expr: l_shipdate + type: string + expr: size(_offsets) + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: 
sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, size(`_offsets`) as sz +from default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate) +PREHOOK: type: QUERY +POSTHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, size(`_offsets`) as sz +from default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF default__lineitem_lineitem_lshipdate_idx__)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL l_shipdate)) (TOK_SELEXPR (TOK_FUNCTION size (TOK_TABLE_OR_COL `_offsets`)) sz)))) t)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL sz)))) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t:default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Select Operator + expressions: + expr: l_shipdate + type: string + expr: size(_offsets) + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator 
+ expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by year(l_shipdate), month(l_shipdate) +PREHOOK: type: QUERY +POSTHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by year(l_shipdate), month(l_shipdate) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Select Operator + expressions: + expr: l_shipdate + type: string + expr: size(_offsets) + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, size(`_offsets`) as sz +from 
default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate) +PREHOOK: type: QUERY +POSTHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, size(`_offsets`) as sz +from default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF default__lineitem_lineitem_lshipdate_idx__)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL l_shipdate)) (TOK_SELEXPR (TOK_FUNCTION size (TOK_TABLE_OR_COL `_offsets`)) sz)))) t)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL sz)))) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t:default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Select Operator + expressions: + expr: l_shipdate + type: string + expr: size(_offsets) + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select year(L_SHIPDATE), month(L_SHIPDATE) as month_bkt, COUNT(1) + from lineitem +group by year(L_SHIPDATE), month(L_SHIPDATE) +PREHOOK: type: QUERY +POSTHOOK: query: explain select year(L_SHIPDATE), month(L_SHIPDATE) as month_bkt, COUNT(1) + from lineitem +group by year(L_SHIPDATE), month(L_SHIPDATE) +POSTHOOK: 
type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL L_SHIPDATE))) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL L_SHIPDATE)) month_bkt) (TOK_SELEXPR (TOK_FUNCTION COUNT 1))) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL L_SHIPDATE)) (TOK_FUNCTION month (TOK_TABLE_OR_COL L_SHIPDATE))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Select Operator + expressions: + expr: l_shipdate + type: string + expr: size(_offsets) + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month + and lastyear.year = thisyear.year +PREHOOK: type: QUERY +POSTHOOK: query: explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + 
from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month + and lastyear.year = thisyear.year +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1997)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) lastyear) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1998)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) thisyear) (and (= (. (TOK_TABLE_OR_COL lastyear) month) (. (TOK_TABLE_OR_COL thisyear) month)) (= (. (TOK_TABLE_OR_COL lastyear) year) (. (TOK_TABLE_OR_COL thisyear) year))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL lastyear) month)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL thisyear) month)) (TOK_SELEXPR (/ (- (. (TOK_TABLE_OR_COL thisyear) monthly_shipments) (. (TOK_TABLE_OR_COL lastyear) monthly_shipments)) (. 
(TOK_TABLE_OR_COL lastyear) monthly_shipments)) monthly_shipments_delta)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-3 + Stage-3 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + lastyear:default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Filter Operator + predicate: + expr: (year(l_shipdate) = 1997) + type: boolean + Select Operator + expressions: + expr: l_shipdate + type: string + expr: size(_offsets) + type: int + outputColumnNames: _col0, _col1 + Filter Operator + predicate: + expr: (year(_col0) = 1997) + type: boolean + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col1 + type: int + expr: _col0 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col1 + type: int + expr: _col0 + type: int + tag: 0 + value expressions: + expr: _col1 + type: int + expr: _col2 + type: bigint + $INTNAME1 + Reduce Output Operator + key expressions: + expr: _col1 + type: int + expr: _col0 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col1 + type: int + expr: _col0 + type: int + tag: 1 + value expressions: + expr: _col1 + type: int + expr: _col2 + type: bigint + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col1} {VALUE._col2} + 1 {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col1, _col2, _col4, _col5 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col4 + type: int + expr: ((_col5 - _col2) / _col2) + type: double + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + thisyear:default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Filter Operator + predicate: + expr: (year(l_shipdate) = 1998) + type: boolean + Select Operator + expressions: + expr: l_shipdate + type: string + expr: size(_offsets) + type: int + outputColumnNames: _col0, 
_col1 + Filter Operator + predicate: + expr: (year(_col0) = 1998) + type: boolean + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: DROP TABLE tbl +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE tbl +POSTHOOK: type: DROPTABLE +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +PREHOOK: query: CREATE TABLE tbl(key int, value int) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE tbl(key int, value int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@tbl +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +PREHOOK: query: CREATE INDEX tbl_key_idx ON TABLE tbl(key) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD +PREHOOK: type: CREATEINDEX +POSTHOOK: query: CREATE INDEX tbl_key_idx ON TABLE tbl(key) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD +POSTHOOK: type: CREATEINDEX +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +PREHOOK: query: ALTER INDEX tbl_key_idx ON tbl REBUILD +PREHOOK: type: 
QUERY +PREHOOK: Input: default@tbl +PREHOOK: Output: default@default__tbl_tbl_key_idx__ +POSTHOOK: query: ALTER INDEX tbl_key_idx ON tbl REBUILD +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl +POSTHOOK: Output: default@default__tbl_tbl_key_idx__ +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +PREHOOK: query: EXPLAIN select key, count(key) from tbl where key = 1 group by key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select key, count(key) from tbl where key = 1 group by key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key)))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Filter Operator + predicate: + expr: (key = 1) + type: boolean + Filter Operator + predicate: + expr: (key = 1) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: size(_offsets) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE 
[(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN select count(1) from tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(1) from tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN select key, count(key) from tbl group by key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select key, count(key) from tbl group by key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key)))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + expr: size(_offsets) + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN select count(key) from tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(key) from tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Group 
By Operator + aggregations: + expr: count(key) + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX 
TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] 
+ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY value, key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY value, key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, 
comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + expressions: + expr: value + type: int + expr: key + type: int + outputColumnNames: value, key + Group By Operator + bucketGroup: false + keys: + expr: value + type: int + expr: key + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key, value FROM tbl GROUP BY value, key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key, value FROM tbl GROUP BY value, key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_GROUPBY (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + 
Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + expressions: + expr: value + type: int + expr: key + type: int + outputColumnNames: value, key + Group By Operator + bucketGroup: false + keys: + expr: value + type: int + expr: key + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) 2)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: (value = 2) + type: boolean + Filter Operator + predicate: + expr: (value = 2) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + 
expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 AND key = 3 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 AND key = 3 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (AND (= (TOK_TABLE_OR_COL value) 2) (= (TOK_TABLE_OR_COL key) 3))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: ((value = 2) and (key = 3)) + type: boolean + Filter Operator + predicate: + expr: ((value = 2) and (key = 3)) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: 
default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: (value = key) + type: boolean + Filter Operator + predicate: + expr: (value = key) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key FROM tbl WHERE key = 3 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key FROM tbl WHERE key = 3 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) 
(TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 3)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Filter Operator + predicate: + expr: (key = 3) + type: boolean + Filter Operator + predicate: + expr: (key = 3) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key, value FROM tbl WHERE value = 1 GROUP BY key, value +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key, value FROM tbl WHERE value = 1 GROUP BY key, value +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) 1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL value)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: (value = 1) + type: boolean + Filter Operator + predicate: + expr: (value = 1) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch 
Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))))) v1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (. (TOK_TABLE_OR_COL v1) value) 2)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + v1:tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: (value = 2) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Filter Operator + predicate: + expr: (_col1 = 2) + type: boolean + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key FROM tbl WHERE value = 2 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key FROM tbl WHERE value = 2 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, 
type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) 2)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: (value = 2) + type: boolean + Filter Operator + predicate: + expr: (value = 2) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + sort order: + + Map-reduce partition columns: + expr: _col0 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl WHERE value = key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl WHERE value = key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 2 3))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + 
Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: (value = key) + type: boolean + Filter Operator + predicate: + expr: (value = key) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: substr(value, 2, 3) + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: string + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 2 3))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: substr(value, 2, 3) + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: string + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: string + 
outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key, substr(key,2,3) +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key, substr(key,2,3) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_FUNCTION substr (TOK_TABLE_OR_COL key) 2 3)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: substr(key, 2, 3) + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: string + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: string + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: DROP TABLE tbl +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@tbl +PREHOOK: Output: default@tbl +POSTHOOK: query: DROP TABLE tbl +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@tbl +POSTHOOK: Output: default@tbl +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] 
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
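The golden output above exercises the group-by-to-index rewrite end to end: GROUP BY / DISTINCT queries whose grouping keys are covered by the key index are planned against the index table default__tbl_tbl_key_idx__ (with count(key) computed as size(_offsets)), while queries that reference value, add extra grouping columns, or filter on non-key predicates keep the original scan of tbl. The sketch below shows, in .q-test style, how plans like these could be reproduced; it is only an illustration. The table and index names (tbl, key, value, tbl_key_idx) come from the lineage entries above, but the DDL, the compact index handler class, and any data load are assumptions and are not part of this hunk.

-- Hypothetical setup (not shown in this diff): base table plus a compact index on key.
CREATE TABLE tbl (key INT, value INT);
CREATE INDEX tbl_key_idx ON TABLE tbl (key)
  AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
  WITH DEFERRED REBUILD;
ALTER INDEX tbl_key_idx ON tbl REBUILD;

-- With the rewrite enabled, these EXPLAINs plan a scan of
-- default__tbl_tbl_key_idx__ instead of tbl; count(key) is read
-- off the index as size(_offsets).
EXPLAIN SELECT DISTINCT key FROM tbl;
EXPLAIN SELECT key, count(key) FROM tbl GROUP BY key;
EXPLAIN SELECT key FROM tbl GROUP BY key;

-- Queries that reference value, group by extra columns, or filter on
-- non-key predicates are left untouched and still scan tbl.
EXPLAIN SELECT DISTINCT key, value FROM tbl;
EXPLAIN SELECT key FROM tbl GROUP BY value, key;
EXPLAIN SELECT key FROM tbl WHERE value = 2 GROUP BY key;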