diff --git a/build.xml b/build.xml
index f41db23..3e94bca 100644
--- a/build.xml
+++ b/build.xml
@@ -52,6 +52,7 @@
+
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 7e5e19f..e45f7e3 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -345,6 +345,10 @@ public class HiveConf extends Configuration {
// For har files
HIVEARCHIVEENABLED("hive.archive.enabled", false),
HIVEHARPARENTDIRSETTABLE("hive.archive.har.parentdir.settable", false),
+
+ //Enable/Disable gbToIdx rewrite rule
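+ //When enabled, the Optimizer adds the RewriteGBUsingIndex transform, which rewrites
+ //eligible group-by queries to scan a compact index table instead of the base table
+ //(illustrative usage: SET hive.optimize.gbyusingindex=true;)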
+ HIVEOPTGBYUSINGINDEX("hive.optimize.gbyusingindex", false),
+
HIVEOUTERJOINSUPPORTSFILTERS("hive.outerjoin.supports.filters", true),
// Serde for FetchTask
@@ -368,7 +372,7 @@ public class HiveConf extends Configuration {
HIVE_ERROR_ON_EMPTY_PARTITION("hive.error.on.empty.partition", false),
- HIVE_INDEX_IGNORE_HDFS_LOC("hive.index.compact.file.ignore.hdfs", false),
+ HIVE_INDEX_IGNORE_HDFS_LOC("hive.index.compact.file.ignore.hdfs", false),
;
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/..orig b/ql/src/java/org/apache/hadoop/hive/ql/metadata/..orig
new file mode 100644
index 0000000..e69de29
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
index 5f78082..a065da9 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
@@ -665,6 +665,16 @@ public class Hive {
throw new HiveException(e);
}
}
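+
+ /**
+ * Get the metadata for all indexes defined on the given table, fetched
+ * through the metastore client.
+ *
+ * @param db_name database name
+ * @param tbl_name table name
+ * @param max maximum number of index entries to return
+ * @throws HiveException
+ */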
+ public List<Index> getIndexesOnTable(String db_name, String tbl_name,
+ short max) throws HiveException {
+ try {
+ return getMSC().listIndexes(db_name, tbl_name, max);
+ } catch (NoSuchObjectException e) {
+ throw new HiveException("Partition or table doesn't exist.", e);
+ } catch (Exception e) {
+ throw new HiveException("Unknow error. Please check logs.", e);
+ }
+ }
public boolean dropIndex(String db_name, String tbl_name, String index_name, boolean deleteData) throws HiveException {
try {
@@ -1476,7 +1486,7 @@ public class Hive {
throw new HiveException(e);
}
}
-
+
/**
* Get all existing role names.
*
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java
index c55a4ec..5e4a22c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java
@@ -38,6 +38,7 @@ import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.ProtectMode;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.Index;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
@@ -805,4 +806,15 @@ public class Table implements Serializable {
public String getCompleteName() {
return getDbName() + "@" + getTableName();
}
+
+ /**
+ * @return List of Index objects for the indexes defined on this table,
+ * if any exist
+ * @throws HiveException
+ **/
+ public List<Index> getAllIndexes(short max) throws HiveException {
+ Hive hive = Hive.get();
+ return hive.getIndexesOnTable(getTTable().getDbName(), getTTable().getTableName(), max);
+ }
+
};
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
index 590d69a..3d7ba1c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
@@ -49,6 +49,9 @@ public class Optimizer {
if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTCP)) {
transformations.add(new ColumnPruner());
}
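+ //Group-by rewrite using a compact index (see RewriteGBUsingIndex); disabled by default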
+ if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTGBYUSINGINDEX)) {
+ transformations.add(new RewriteGBUsingIndex());
+ }
if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTPPD)) {
transformations.add(new PredicatePushDown());
transformations.add(new PartitionPruner());
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyCtx.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyCtx.java
new file mode 100644
index 0000000..6bf830c
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyCtx.java
@@ -0,0 +1,388 @@
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.Index;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.PreOrderWalker;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+/**
+ * RewriteCanApplyCtx class stores the context for the {@link RewriteCanApplyProcFactory} to determine
+ * if any index can be used and if the input query meets all the criteria for rewrite optimization.
+ */
+public final class RewriteCanApplyCtx implements NodeProcessorCtx {
+
+ protected final Log LOG = LogFactory.getLog(RewriteCanApplyCtx.class.getName());
+
+ private RewriteCanApplyCtx(ParseContext parseContext, HiveConf conf) {
+ this.parseContext = parseContext;
+ this.hiveConf = conf;
+ initRewriteVars();
+ }
+
+ public static RewriteCanApplyCtx getInstance(ParseContext parseContext, HiveConf conf){
+ return new RewriteCanApplyCtx(parseContext, conf);
+ }
+
+ public static enum RewriteVars {
+ AGG_FUNC_CNT("hive.ql.rewrites.agg.func.cnt", 0),
+ GBY_KEY_CNT("hive.ql.rewrites.gby.key.cnt", 0),
+ QUERY_HAS_SORT_BY("hive.ql.rewrites.query.has.sort.by", false),
+ QUERY_HAS_ORDER_BY("hive.ql.rewrites.query.has.order.by", false),
+ QUERY_HAS_DISTRIBUTE_BY("hive.ql.rewrites.query.has.distribute.by", false),
+ QUERY_HAS_GROUP_BY("hive.ql.rewrites.query.has.group.by", false),
+ QUERY_HAS_DISTINCT("hive.ql.rewrites.query.has.distinct", false), //This still uses QBParseInfo to make decision. Needs to be changed if QB dependency is not desired.
+ AGG_FUNC_IS_NOT_COUNT("hive.ql.rewrites.agg.func.is.not.count", false),
+ AGG_FUNC_COLS_FETCH_EXCEPTION("hive.ql.rewrites.agg.func.cols.fetch.exception", false),
+ WHR_CLAUSE_COLS_FETCH_EXCEPTION("hive.ql.rewrites.whr.clause.cols.fetch.exception", false),
+ SEL_CLAUSE_COLS_FETCH_EXCEPTION("hive.ql.rewrites.sel.clause.cols.fetch.exception", false),
+ GBY_KEYS_FETCH_EXCEPTION("hive.ql.rewrites.gby.keys.fetch.exception", false),
+ COUNT_ON_ALL_COLS("hive.ql.rewrites.count.on.all.cols", false),
+ QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY("hive.ql.rewrites.query.has.genericudf.on.groupby.key", false),
+ QUERY_HAS_MULTIPLE_TABLES("hive.ql.rewrites.query.has.multiple.tables", false),
+ SHOULD_APPEND_SUBQUERY("hive.ql.rewrites.should.append.subquery", false),
+ REMOVE_GROUP_BY("hive.ql.rewrites.remove.group.by", false);
+ ;
+
+ public final String varname;
+ public final int defaultIntVal;
+ public final boolean defaultBoolVal;
+ public final Class<?> valClass;
+
+ //Constructors for int and boolean values
+ RewriteVars(String varname, int defaultIntVal) {
+ this.varname = varname;
+ this.valClass = Integer.class;
+ this.defaultIntVal = defaultIntVal;
+ this.defaultBoolVal = false;
+ }
+
+ RewriteVars(String varname, boolean defaultBoolVal) {
+ this.varname = varname;
+ this.valClass = Boolean.class;
+ this.defaultIntVal = -1;
+ this.defaultBoolVal = defaultBoolVal;
+ }
+
+ @Override
+ public String toString() {
+ return varname;
+ }
+
+
+
+ }
+
+ /*
+ * Methods to set and retrieve the RewriteVars enum variables
+ * */
+ public int getIntVar(Configuration conf, RewriteVars var) {
+ assert (var.valClass == Integer.class);
+ return conf.getInt(var.varname, var.defaultIntVal);
+ }
+
+ public void setIntVar(Configuration conf, RewriteVars var, int val) {
+ assert (var.valClass == Integer.class);
+ conf.setInt(var.varname, val);
+ }
+
+ public boolean getBoolVar(Configuration conf, RewriteVars var) {
+ assert (var.valClass == Boolean.class);
+ return conf.getBoolean(var.varname, var.defaultBoolVal);
+ }
+
+ public void setBoolVar(Configuration conf, RewriteVars var, boolean val) {
+ assert (var.valClass == Boolean.class);
+ conf.setBoolean(var.varname, val);
+ }
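+ //For example, setBoolVar(conf, RewriteVars.QUERY_HAS_GROUP_BY, true) stores
+ //"hive.ql.rewrites.query.has.group.by" = true in the Configuration, and
+ //getBoolVar(conf, RewriteVars.QUERY_HAS_GROUP_BY) reads it back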
+
+ public void initRewriteVars(){
+ setIntVar(hiveConf, RewriteVars.AGG_FUNC_CNT,0);
+ setIntVar(hiveConf, RewriteVars.GBY_KEY_CNT,0);
+ setBoolVar(hiveConf, RewriteVars.QUERY_HAS_SORT_BY, false);
+ setBoolVar(hiveConf, RewriteVars.QUERY_HAS_ORDER_BY, false);
+ setBoolVar(hiveConf, RewriteVars.QUERY_HAS_DISTRIBUTE_BY, false);
+ setBoolVar(hiveConf, RewriteVars.QUERY_HAS_GROUP_BY, false);
+ setBoolVar(hiveConf, RewriteVars.QUERY_HAS_DISTINCT, false);
+ setBoolVar(hiveConf, RewriteVars.AGG_FUNC_IS_NOT_COUNT, false);
+ setBoolVar(hiveConf, RewriteVars.AGG_FUNC_COLS_FETCH_EXCEPTION, false);
+ setBoolVar(hiveConf, RewriteVars.WHR_CLAUSE_COLS_FETCH_EXCEPTION, false);
+ setBoolVar(hiveConf, RewriteVars.SEL_CLAUSE_COLS_FETCH_EXCEPTION, false);
+ setBoolVar(hiveConf, RewriteVars.GBY_KEYS_FETCH_EXCEPTION, false);
+ setBoolVar(hiveConf, RewriteVars.COUNT_ON_ALL_COLS, false);
+ setBoolVar(hiveConf, RewriteVars.QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY, false);
+ setBoolVar(hiveConf, RewriteVars.QUERY_HAS_MULTIPLE_TABLES, false);
+ setBoolVar(hiveConf, RewriteVars.SHOULD_APPEND_SUBQUERY, false);
+ setBoolVar(hiveConf, RewriteVars.REMOVE_GROUP_BY, false);
+ }
+
+
+
+
+ //Data structures that are populated in the RewriteCanApplyProcFactory methods to check if the index key meets all criteria
+ Set<String> selectColumnsList = new LinkedHashSet<String>();
+ Set<String> predicateColumnsList = new LinkedHashSet<String>();
+ Set<String> gbKeyNameList = new LinkedHashSet<String>();
+ Set<String> aggFuncColList = new LinkedHashSet<String>();
+
+ private final HiveConf hiveConf;
+ private int aggFuncCnt = 0;
+ private final ParseContext parseContext;
+ private String baseTableName = "";
+
+ void resetCanApplyCtx(){
+ aggFuncCnt = 0;
+ selectColumnsList.clear();
+ predicateColumnsList.clear();
+ gbKeyNameList.clear();
+ aggFuncColList.clear();
+ baseTableName = "";
+ }
+
+ public Set<String> getSelectColumnsList() {
+ return selectColumnsList;
+ }
+
+ public void setSelectColumnsList(Set<String> selectColumnsList) {
+ this.selectColumnsList = selectColumnsList;
+ }
+
+ public Set<String> getPredicateColumnsList() {
+ return predicateColumnsList;
+ }
+
+ public void setPredicateColumnsList(Set<String> predicateColumnsList) {
+ this.predicateColumnsList = predicateColumnsList;
+ }
+
+ public Set<String> getGbKeyNameList() {
+ return gbKeyNameList;
+ }
+
+ public void setGbKeyNameList(Set<String> gbKeyNameList) {
+ this.gbKeyNameList = gbKeyNameList;
+ }
+
+ public Set<String> getAggFuncColList() {
+ return aggFuncColList;
+ }
+
+ public void setAggFuncColList(Set<String> aggFuncColList) {
+ this.aggFuncColList = aggFuncColList;
+ }
+
+ public HiveConf getConf() {
+ return hiveConf;
+ }
+
+ public int getAggFuncCnt() {
+ return aggFuncCnt;
+ }
+
+ public void setAggFuncCnt(int aggFuncCnt) {
+ this.aggFuncCnt = aggFuncCnt;
+ }
+
+ public String getBaseTableName() {
+ return baseTableName;
+ }
+
+ public void setBaseTableName(String baseTableName) {
+ this.baseTableName = baseTableName;
+ }
+
+ public ParseContext getParseContext() {
+ return parseContext;
+ }
+
+
+ /**
+ * This method walks all the nodes starting from topOp TableScanOperator node
+ * and invokes methods from {@link RewriteCanApplyProcFactory} for each of the rules
+ * added to the opRules map. We use the {@link PreOrderWalker} for a pre-order
+ * traversal of the operator tree.
+ *
+ * The methods from {@link RewriteCanApplyProcFactory} set appropriate values in
+ * {@link RewriteVars} enum.
+ *
+ * @param topOp
+ */
+ void populateRewriteVars(Operator<? extends Serializable> topOp){
+ Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+ opRules.put(new RuleRegExp("R1", "FIL%"), RewriteCanApplyProcFactory.canApplyOnFilterOperator());
+ opRules.put(new RuleRegExp("R2", "GBY%"), RewriteCanApplyProcFactory.canApplyOnGroupByOperator());
+ opRules.put(new RuleRegExp("R3", "RS%OP%"), RewriteCanApplyProcFactory.canApplyOnExtractOperator());
+ opRules.put(new RuleRegExp("R4", "SEL%"), RewriteCanApplyProcFactory.canApplyOnSelectOperator());
+
+ // The dispatcher fires the processor corresponding to the closest matching
+ // rule and passes the context along
+ Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this);
+ GraphWalker ogw = new PreOrderWalker(disp);
+
+ // Create a list of topop nodes
+ ArrayList<Node> topNodes = new ArrayList<Node>();
+ topNodes.add(topOp);
+
+ try {
+ ogw.startWalking(topNodes, null);
+ } catch (SemanticException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+
+ }
+
+
+ /**
+ * Default procedure for {@link DefaultRuleDispatcher}
+ * @return
+ */
+ private NodeProcessor getDefaultProc() {
+ return new NodeProcessor() {
+ @Override
+ public Object process(Node nd, Stack<Node> stack,
+ NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
+ return null;
+ }
+ };
+ }
+
+
+ //Map for base table to index table mapping
+ //TableScan operator for base table will be modified to read from index table
+ private final HashMap<String, String> baseToIdxTableMap = new HashMap<String, String>();
+
+
+ public void addTable(String baseTableName, String indexTableName) {
+ baseToIdxTableMap.put(baseTableName, indexTableName);
+ }
+
+ public String findBaseTable(String baseTableName) {
+ return baseToIdxTableMap.get(baseTableName);
+ }
+
+
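+ /**
+ * Checks whether the given index covers all the columns collected from the query
+ * (select list, where-clause predicates, group-by keys and aggregate function inputs)
+ * and, if so, decides whether the group-by can be removed or a subquery over the
+ * index table needs to be appended instead.
+ *
+ * @param index candidate index
+ * @param indexKeyNames key column names of the candidate index
+ */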
+ boolean isIndexUsableForQueryBranchRewrite(Index index, Set<String> indexKeyNames){
+ boolean removeGroupBy = true;
+ boolean optimizeCount = false;
+
+ //--------------------------------------------
+ //Check if all columns in select list are part of index key columns
+ if (!indexKeyNames.containsAll(selectColumnsList)) {
+ LOG.info("Select list has non index key column : " +
+ " Cannot use index " + index.getIndexName());
+ return false;
+ }
+
+ //--------------------------------------------
+ // Check if all columns in where predicate are part of index key columns
+ // TODO: Currently we allow all predicates , would it be more efficient
+ // (or at least not worse) to read from index_table and not from baseTable?
+ if (!indexKeyNames.containsAll(predicateColumnsList)) {
+ LOG.info("Predicate column ref list has non index key column : " +
+ " Cannot use index " + index.getIndexName());
+ return false;
+ }
+
+ //--------------------------------------------
+ // For group by, we need to check if all keys are from index columns
+ // itself. Here GB key order can be different than index columns but that does
+ // not really matter for final result.
+ // E.g. select c1, c2 from src group by c2, c1;
+ // we can rewrite this one to:
+ // select c1, c2 from src_cmpt_idx;
+ if (!indexKeyNames.containsAll(gbKeyNameList)) {
+ LOG.info("Group by key has some non-indexed columns, " +
+ " Cannot use index " + index.getIndexName());
+ return false;
+ }
+
+ // FUTURE: See if this can be relaxed.
+ // If we have agg function (currently only COUNT is supported), check if its input are
+ // from index. we currently support only that.
+ if (aggFuncColList.size() > 0) {
+ if (indexKeyNames.containsAll(aggFuncColList) == false) {
+ LOG.info("Agg Func input is not present in index key columns. Currently " +
+ "only agg func on index columns are supported by rewrite optimization" );
+ return false;
+ }
+ // If we have count on some key, check if key is same as index key,
+ if (aggFuncColList.containsAll(indexKeyNames)) {
+ optimizeCount = true;
+ }
+ }
+
+ if (!gbKeyNameList.containsAll(indexKeyNames)) {
+ // GB key and idx key are not same, don't remove GroupBy, but still do index scan
+ LOG.info("Index has some non-groupby columns, GroupBy will be"
+ + " preserved by rewrite optimization but original table scan"
+ + " will be replaced with index table scan." );
+ removeGroupBy = false;
+ }
+
+ // This check prevents to remove GroupBy for cases where the GROUP BY key cols are
+ // not simple expressions i.e. simple index key cols (in any order), but some
+ // expressions on the key cols.
+ // e.g.
+ // 1. GROUP BY key, f(key)
+ // FUTURE: If f(key) output is functionally dependent on key, then we should support
+ // it. However we don't have mechanism/info about f() yet to decide that.
+ // 2. GROUP BY idxKey, 1
+ // FUTURE: GB Key has literals along with idxKeyCols. Develop a rewrite to eliminate the
+ // literals from GB key.
+ // 3. GROUP BY idxKey, idxKey
+ // FUTURE: GB Key has dup idxKeyCols. Develop a rewrite to eliminate the dup key cols
+ // from GB key.
+ if (getBoolVar(hiveConf, RewriteVars.QUERY_HAS_GROUP_BY) &&
+ indexKeyNames.size() < getIntVar(hiveConf, RewriteVars.GBY_KEY_CNT)) {
+ LOG.info("Group by key has some non-indexed columns, GroupBy will be"
+ + " preserved by rewrite optimization" );
+ removeGroupBy = false;
+ }
+
+
+ //Now that we are good to do this optimization, set parameters in context
+ //which would be used by transformation procedure as inputs.
+
+ //sub-query is needed only in case of optimizecount and complex gb keys?
+ if(getBoolVar(hiveConf, RewriteVars.QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY) == false
+ && !(optimizeCount == true && removeGroupBy == false) ) {
+ setBoolVar(hiveConf, RewriteVars.REMOVE_GROUP_BY, removeGroupBy);
+ addTable(baseTableName, index.getIndexTableName());
+ }else if(getBoolVar(hiveConf, RewriteVars.QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY) == true &&
+ getIntVar(hiveConf, RewriteVars.AGG_FUNC_CNT) == 1 &&
+ getBoolVar(hiveConf, RewriteVars.AGG_FUNC_IS_NOT_COUNT) == false){
+ setBoolVar(hiveConf, RewriteVars.SHOULD_APPEND_SUBQUERY, true);
+ addTable(baseTableName, index.getIndexTableName());
+ }else{
+ LOG.info("No valid criteria met to apply rewrite." );
+ return false;
+ }
+
+ return true;
+ }
+
+
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyProcFactory.java
new file mode 100644
index 0000000..ff8d90f
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteCanApplyProcFactory.java
@@ -0,0 +1,308 @@
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.ExtractOperator;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.optimizer.RewriteCanApplyCtx.RewriteVars;
+import org.apache.hadoop.hive.ql.parse.ASTNode;
+import org.apache.hadoop.hive.ql.parse.HiveParser;
+import org.apache.hadoop.hive.ql.parse.QBParseInfo;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+
+/**
+ * Factory of the node processor methods used by {@link RewriteGBUsingIndex}
+ * (see {@link RewriteCanApplyCtx}'s populateRewriteVars(..) method)
+ * to determine if the rewrite optimization can be applied to the input query
+ *
+ */
+public final class RewriteCanApplyProcFactory {
+ protected final static Log LOG = LogFactory.getLog(RewriteCanApplyProcFactory.class.getName());
+ private static RewriteCanApplyCtx canApplyCtx = null;
+
+ private RewriteCanApplyProcFactory(){
+ //this prevents the class from getting instantiated
+ }
+
+
+ /**
+ * Check for conditions in FilterOperator that do not meet rewrite criteria.
+ * Set the appropriate variables in {@link RewriteVars} enum.
+ */
+ private static class CheckFilterProc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ FilterOperator operator = (FilterOperator)nd;
+ canApplyCtx = (RewriteCanApplyCtx)ctx;
+ FilterDesc conf = (FilterDesc)operator.getConf();
+ //The filter operator should have a predicate of ExprNodeGenericFuncDesc type.
+ //This represents the comparison operator
+ ExprNodeGenericFuncDesc oldengfd = (ExprNodeGenericFuncDesc) conf.getPredicate();
+ if(oldengfd == null){
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.WHR_CLAUSE_COLS_FETCH_EXCEPTION, true);
+ //abort the walk here; proceeding would dereference a null predicate
+ return null;
+ }
+ //The predicate should have valid left and right columns
+ List<String> colList = oldengfd.getCols();
+ if(colList == null || colList.size() == 0){
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.WHR_CLAUSE_COLS_FETCH_EXCEPTION, true);
+ //abort the walk here; the flag set above makes the rewrite bail out later
+ return null;
+ }
+ //Add the predicate columns to RewriteCanApplyCtx's predColRefs list to check later
+ //if index keys contain all filter predicate columns and vice-a-versa
+ for (String col : colList) {
+ canApplyCtx.getPredicateColumnsList().add(col);
+ }
+
+ return null;
+ }
+ }
+
+ public static CheckFilterProc canApplyOnFilterOperator() {
+ return new CheckFilterProc();
+ }
+
+
+
+ /**
+ * Check for conditions in GroupByOperator that do not meet rewrite criteria.
+ * Set the appropriate variables in {@link RewriteVars} enum.
+ *
+ */
+ private static class CheckGroupByProc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ GroupByOperator operator = (GroupByOperator)nd;
+ canApplyCtx = (RewriteCanApplyCtx)ctx;
+ //for each group-by clause in query, only one GroupByOperator of the GBY-RS-GBY sequence is stored in getGroupOpToInputTables
+ //we need to process only this operator
+ //Also, we do not rewrite for cases when same query branch has multiple group-by constructs
+ if(canApplyCtx.getParseContext().getGroupOpToInputTables().containsKey(operator) &&
+ canApplyCtx.getBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_GROUP_BY) == false ){
+
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_GROUP_BY, true);
+
+ GroupByDesc conf = (GroupByDesc) operator.getConf();
+ ArrayList<AggregationDesc> aggrList = conf.getAggregators();
+ if(aggrList != null && aggrList.size() > 0){
+ for (AggregationDesc aggregationDesc : aggrList) {
+ int aggCnt = canApplyCtx.getAggFuncCnt();
+ canApplyCtx.setIntVar(canApplyCtx.getConf(), RewriteVars.AGG_FUNC_CNT, aggCnt + 1);
+ canApplyCtx.setAggFuncCnt(aggCnt + 1);
+
+ //In the current implementation, we do not support more than 1 agg funcs in group-by
+ if(canApplyCtx.getIntVar(canApplyCtx.getConf(), RewriteVars.AGG_FUNC_CNT) > 1) {
+ return false;
+ }
+ String aggFunc = aggregationDesc.getGenericUDAFName();
+ if(!aggFunc.equals("count")){
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.AGG_FUNC_IS_NOT_COUNT, true);
+ //return false;
+ }else{
+ ArrayList<ExprNodeDesc> para = aggregationDesc.getParameters();
+ //for a valid aggregation, it needs to have non-null parameter list
+ if(para == null){
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.AGG_FUNC_COLS_FETCH_EXCEPTION, true);
+ //return false;
+ }else if(para.size() == 0){
+ //count(*) case
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.COUNT_ON_ALL_COLS, true);
+ //return false;
+ }else{
+ for(int i=0; i< para.size(); i++){
+ ExprNodeDesc end = para.get(i);
+ if(end instanceof ExprNodeColumnDesc){
+ //Add the columns to RewriteCanApplyCtx's selectColumnsList list to check later
+ //if index keys contain all select clause columns and vice-a-versa
+ //we get the select column 'actual' names only here if we have an agg func along with group-by
+ //SelectOperator has internal names in its colList data structure
+ canApplyCtx.getSelectColumnsList().add(((ExprNodeColumnDesc) end).getColumn());
+
+ //Add the columns to RewriteCanApplyCtx's aggFuncColList list to check later
+ //if columns contained in agg func are index key columns
+ canApplyCtx.getAggFuncColList().add(((ExprNodeColumnDesc) end).getColumn());
+ }
+ }
+ }
+ }
+ }
+ }else{
+ //if group-by does not have aggregation list, then it "might" be a DISTINCT case
+ //this code uses query block to determine if the ASTNode tree contains the distinct TOK_SELECTDI token
+ QBParseInfo qbParseInfo = canApplyCtx.getParseContext().getQB().getParseInfo();
+ Set<String> clauseNameSet = qbParseInfo.getClauseNames();
+ if (clauseNameSet.size() == 1) {
+ Iterator<String> clauseNameIter = clauseNameSet.iterator();
+ String clauseName = clauseNameIter.next();
+ ASTNode rootSelExpr = qbParseInfo.getSelForClause(clauseName);
+ boolean isDistinct = (rootSelExpr.getType() == HiveParser.TOK_SELECTDI);
+ if(isDistinct) {
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_DISTINCT, true);
+ }
+ }
+ }
+
+
+ //we need to have non-null group-by keys for a valid group-by operator
+ ArrayList<ExprNodeDesc> keyList = conf.getKeys();
+ if(keyList == null || keyList.size() == 0){
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.GBY_KEYS_FETCH_EXCEPTION, true);
+ //abort the walk here; the flag set above makes the rewrite bail out later
+ return null;
+ }
+
+ //sets the no. of keys in group-by to be used later to determine if group-by has non-index cols
+ //group-by needs to be preserved in such cases (e.g. group-by using a function on index key; this is the subquery append case)
+ canApplyCtx.setIntVar(canApplyCtx.getConf(), RewriteVars.GBY_KEY_CNT, keyList.size());
+ for (ExprNodeDesc exprNodeDesc : keyList) {
+ if(exprNodeDesc instanceof ExprNodeColumnDesc){
+ //Add the group-by keys to RewriteCanApplyCtx's gbKeyNameList list to check later
+ //if all keys are from index columns
+ canApplyCtx.getGbKeyNameList().addAll(exprNodeDesc.getCols());
+ }else if(exprNodeDesc instanceof ExprNodeGenericFuncDesc){
+ ExprNodeGenericFuncDesc endfg = (ExprNodeGenericFuncDesc)exprNodeDesc;
+ List<ExprNodeDesc> childExprs = endfg.getChildExprs();
+ for (ExprNodeDesc end : childExprs) {
+ if(end instanceof ExprNodeColumnDesc){
+ //Set QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY to true which is used later to determine
+ //whether the rewrite is a 'append subquery' case
+ //this is true in case the group-by key is a GenericUDF like year,month etc
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY, true);
+ canApplyCtx.getGbKeyNameList().addAll(exprNodeDesc.getCols());
+ canApplyCtx.getSelectColumnsList().add(((ExprNodeColumnDesc) end).getColumn());
+ }
+ }
+ }
+ }
+
+ }
+
+ return null;
+ }
+ }
+
+ public static CheckGroupByProc canApplyOnGroupByOperator() {
+ return new CheckGroupByProc();
+ }
+
+
+ /**
+ * Check for conditions in ExtractOperator that do not meet rewrite criteria.
+ * Set the appropriate variables in {@link RewriteVars} enum.
+ *
+ */
+ private static class CheckExtractProc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ ExtractOperator operator = (ExtractOperator)nd;
+ canApplyCtx = (RewriteCanApplyCtx)ctx;
+ //We get the information whether query has SORT BY, ORDER BY, DISTRIBUTE BY from
+ //the parent ReduceSinkOperator of the current ExtractOperator
+ if(operator.getParentOperators() != null && operator.getParentOperators().size() >0){
+ Operator<? extends Serializable> interim = operator.getParentOperators().get(0);
+ if(interim instanceof ReduceSinkOperator){
+ ReduceSinkDesc conf = (ReduceSinkDesc) interim.getConf();
+ ArrayList<ExprNodeDesc> partCols = conf.getPartitionCols();
+ int nr = conf.getNumReducers();
+ if(nr == -1){
+ if(partCols != null && partCols.size() > 0){
+ //query has distribute-by if there are non-zero partition columns
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_DISTRIBUTE_BY, true);
+ //return false;
+ }else{
+ //we do not need partition columns in case of sort-by
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_SORT_BY, true);
+ //return false;
+ }
+ }else if(nr == 1){
+ //Query has order-by only if number of reducers is 1
+ canApplyCtx.setBoolVar(canApplyCtx.getConf(), RewriteVars.QUERY_HAS_ORDER_BY, true);
+ //return false;
+ }
+
+ }
+ }
+
+ return null;
+ }
+ }
+
+ public static CheckExtractProc canApplyOnExtractOperator() {
+ return new CheckExtractProc();
+ }
+
+ /**
+ * Check for conditions in SelectOperator that do not meet rewrite criteria.
+ * Set the appropriate variables in {@link RewriteVars} enum.
+ *
+ */
+ private static class CheckSelectProc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ SelectOperator operator = (SelectOperator)nd;
+ canApplyCtx = (RewriteCanApplyCtx)ctx;
+
+ List<Operator<? extends Serializable>> childrenList = operator.getChildOperators();
+ Operator<? extends Serializable> child = childrenList.get(0);
+ if(child instanceof FileSinkOperator){
+ Map<String, String> internalToAlias = new LinkedHashMap<String, String>();
+ RowSchema rs = operator.getSchema();
+ //to get the internal to alias mapping
+ ArrayList<ColumnInfo> sign = rs.getSignature();
+ for (ColumnInfo columnInfo : sign) {
+ internalToAlias.put(columnInfo.getInternalName(), columnInfo.getAlias());
+ //Add the columns to RewriteCanApplyCtx's selectColumnsList list to check later
+ //if index keys contain all select clause columns and vice-a-versa
+/* if(!columnInfo.getAlias().startsWith("_c")){
+ canApplyCtx.getSelectColumnsList().add(columnInfo.getAlias());
+ }
+*/ }
+
+ //if FilterOperator predicate has internal column names, we need to retrieve the 'actual' column names to
+ //check if index keys contain all filter predicate columns and vice-a-versa
+ //collect the replacements first; removing/adding entries while iterating over the
+ //set itself would throw ConcurrentModificationException
+ Map<String, String> predColReplacements = new LinkedHashMap<String, String>();
+ Iterator<String> predItr = canApplyCtx.getPredicateColumnsList().iterator();
+ while(predItr.hasNext()){
+ String predCol = predItr.next();
+ if(predCol.startsWith("_c") && internalToAlias.get(predCol) != null){
+ predColReplacements.put(predCol, internalToAlias.get(predCol));
+ }
+ }
+ for (Map.Entry<String, String> entry : predColReplacements.entrySet()) {
+ canApplyCtx.getPredicateColumnsList().remove(entry.getKey());
+ canApplyCtx.getPredicateColumnsList().add(entry.getValue());
+ }
+ }
+ return null;
+ }
+ }
+
+ public static CheckSelectProc canApplyOnSelectOperator() {
+ return new CheckSelectProc();
+ }
+
+
+
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteGBUsingIndex.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteGBUsingIndex.java
new file mode 100644
index 0000000..b8e7b5b
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteGBUsingIndex.java
@@ -0,0 +1,509 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.Index;
+import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.RewriteCanApplyCtx.RewriteVars;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.QBParseInfo;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+
+/**
+ * RewriteGBUsingIndex is implemented as one of the Rule-based Optimizations.
+ * Implements optimizations for GroupBy clause rewrite using compact index.
+ * This optimization rewrites a GroupBy query over the base table to a simple table-scan query over
+ * the index table, if there is an index on the group by key(s) or the distinct column(s).
+ * E.g.
+ *
+ * select key
+ * from table
+ * group by key;
+ *
+ * to
+ *
+ * select key
+ * from idx_table;
+ *
+ *
+ * The rewrite supports following queries
+ * - Queries having only those col refs that are in the index key.
+ * - Queries that have index key col refs
+ * - in SELECT
+ * - in WHERE
+ * - in GROUP BY
+ * - Queries with agg func COUNT(literal) or COUNT(index key col ref)
+ * in SELECT
+ * - Queries with SELECT DISTINCT index_key_col_refs
+ * - Queries having a subquery satisfying above condition (only the
+ * subquery is rewritten)
+ *
+ * FUTURE:
+ * - Many of the checks for above criteria rely on equivalence of expressions,
+ * but such a framework/mechanism for checking expression equivalence isn't available yet.
+ * This needs to be supported for more robust checks, which is critically important for
+ * the correctness of a query rewrite system.
+ * - This code currently uses index types with org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler.
+ * However, the CompactIndexHandler currently stores the distinct block offsets and not the row offsets.
+ * Use of this index type will give erroneous results to compute COUNT if the same key appears more
+ * than once within the same block. To address this issue, we plan to create a new index type in future.
+ *
+ *
+ * @see RewriteCanApplyCtx
+ * @see RewriteCanApplyProcFactory
+ * @see RewriteRemoveGroupbyCtx
+ * @see RewriteRemoveGroupbyProcFactory
+ * @see RewriteIndexSubqueryCtx
+ * @see RewriteIndexSubqueryProcFactory
+ * @see RewriteParseContextGenerator
+ *
+ */
+public class RewriteGBUsingIndex implements Transform {
+ private ParseContext parseContext;
+ private Hive hiveDb;
+ private HiveConf hiveConf;
+ protected final Log LOG = LogFactory.getLog(this.getClass().getName());
+
+ //Stores the list of top TableScanOperator names for which the rewrite can be applied and the action that needs to be performed for operator tree
+ //starting from this TableScanOperator
+ private final Map<String, RewriteCanApplyCtx> tsOpToProcess = new LinkedHashMap<String, RewriteCanApplyCtx>();
+
+ //Name of the current table on which rewrite is being performed
+ private String baseTableName = null;
+ private String indexTableName = null;
+
+ /***************************************Index Validation Variables***************************************/
+ //The SUPPORTED_INDEX_TYPE value will change when we implement a new index handler to retrieve correct result
+ // for count if the same key appears more than once within the same block
+ final String SUPPORTED_INDEX_TYPE =
+ "org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler";
+ final String COMPACT_IDX_BUCKET_COL = "_bucketname";
+ final String COMPACT_IDX_OFFSETS_ARRAY_COL = "_offsets";
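+ //A compact index table stores the index key columns followed by the _bucketname and
+ //_offsets columns (see the schema asserts in populateIndexToKeysMap below)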
+
+ @Override
+ public ParseContext transform(ParseContext pctx) throws SemanticException {
+ parseContext = pctx;
+ hiveConf = parseContext.getConf();
+ try {
+ hiveDb = Hive.get(hiveConf);
+ } catch (HiveException e) {
+ LOG.info("Exception in getting hive conf");
+ e.printStackTrace();
+ }
+
+
+ /* Check if the input query is internal query that inserts in table (eg. ALTER INDEX...REBUILD etc.)
+ * We do not apply optimization here.
+ * */
+ if(isQueryInsertToTable()){
+ return parseContext;
+ }else{
+ /* Check if the input query passes all the tests to be eligible for a rewrite
+ * If yes, rewrite original query; else, return the current parseContext
+ * */
+ if(shouldApplyOptimization()){
+ LOG.debug("Rewriting Original Query.");
+ rewriteOriginalQuery();
+ }
+ return parseContext;
+ }
+
+ }
+
+ /**
+ * Use Query block's parse {@link QBParseInfo} information to check if the input query
+ * is an internal SQL.
+ * If it is true, we do not apply this optimization.
+ * @return
+ */
+ private boolean isQueryInsertToTable(){
+ QBParseInfo qbParseInfo = parseContext.getQB().getParseInfo();
+ return qbParseInfo.isInsertToTable();
+ }
+
+ /**
+ * We traverse the current operator tree to check for conditions in which the
+ * optimization cannot be applied.
+ *
+ * At the end, we check if all conditions have passed for rewrite. If yes, we
+ * determine if the the index is usable for rewrite. Else, we log the condition which
+ * did not meet the rewrite criterion.
+ *
+ * @return
+ */
+ boolean shouldApplyOptimization(){
+ boolean canApply = false;
+ if(ifQueryHasMultipleTables()){
+ //We do not apply this optimization for this case as of now.
+ return false;
+ }else{
+ /*
+ * This code iterates over each TableScanOperator from the topOps map from ParseContext.
+ * For each operator tree originating from this top TableScanOperator, we determine
+ * if the optimization can be applied. If yes, we add the name of the top table to
+ * the tsOpToProcess to apply rewrite later on.
+ * */
+ HashMap<TableScanOperator, Table> topToTable = parseContext.getTopToTable();
+ HashMap<String, Operator<? extends Serializable>> topOps = parseContext.getTopOps();
+ Iterator<TableScanOperator> topOpItr = topToTable.keySet().iterator();
+ while(topOpItr.hasNext()){
+ //Context for checking if this optimization can be applied to the input query
+ RewriteCanApplyCtx canApplyCtx = RewriteCanApplyCtx.getInstance(parseContext, hiveConf);
+
+ TableScanOperator topOp = topOpItr.next();
+ Table table = topToTable.get(topOp);
+ baseTableName = table.getTableName();
+ HashMap<Index, Set<String>> indexTableMap = getIndexTableInfoForRewrite(topOp);
+
+ if(indexTableMap != null){
+ if(indexTableMap.size() == 0){
+ LOG.info("No Valid Index Found to apply Rewrite, " +
+ "skipping " + getName() + " optimization" );
+ } else if(indexTableMap.size() > 1){
+ LOG.info("Table has multiple valid index tables to apply rewrite.");
+ }else{
+ canApplyCtx.setBaseTableName(baseTableName);
+ canApplyCtx.populateRewriteVars(topOp);
+
+ Iterator<Index> indexMapItr = indexTableMap.keySet().iterator();
+ Index index = null;
+ while(indexMapItr.hasNext()){
+ //we rewrite the original query using the first valid index encountered
+ //this can be changed if we have a better mechanism to decide which index will produce a better rewrite
+ index = indexMapItr.next();
+ canApply = canApplyCtx.isIndexUsableForQueryBranchRewrite(index, indexTableMap.get(index));
+ if(canApply){
+ canApply = checkIfAllRewriteCriteriaIsMet(canApplyCtx);
+ break;
+ }
+ }
+ indexTableName = index.getIndexTableName();
+
+ if(canApply && topOps.containsValue(topOp)) {
+ Iterator<String> topOpNamesItr = topOps.keySet().iterator();
+ while(topOpNamesItr.hasNext()){
+ String topOpName = topOpNamesItr.next();
+ if(topOps.get(topOpName).equals(topOp)){
+ tsOpToProcess.put(topOpName, canApplyCtx);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return canApply;
+ }
+
+
+ /**
+ * Method to rewrite the input query if all optimization criteria is passed.
+ * The method iterates over the tsOpToProcess map to apply the rewrites
+ *
+ * @throws SemanticException
+ */
+ private void rewriteOriginalQuery() throws SemanticException{
+ HashMap<String, Operator<? extends Serializable>> topOpMap = parseContext.getTopOps();
+ Iterator<String> tsOpItr = tsOpToProcess.keySet().iterator();
+ while(tsOpItr.hasNext()){
+ baseTableName = tsOpItr.next();
+ RewriteCanApplyCtx canApplyCtx = tsOpToProcess.get(baseTableName);
+ TableScanOperator topOp = (TableScanOperator) topOpMap.get(baseTableName);
+
+ /* This part of the code checks if the 'REMOVE_GROUP_BY' value in RewriteVars enum is set to true.
+ * If yes, it sets the environment for the RewriteRemoveGroupbyCtx context and invokes
+ * method to apply rewrite by removing group by construct operators from the original operator tree.
+ * */
+ if(canApplyCtx.getBoolVar(hiveConf, RewriteVars.REMOVE_GROUP_BY)){
+ //Context for removing the group by construct operators from the operator tree
+ RewriteRemoveGroupbyCtx removeGbyCtx = RewriteRemoveGroupbyCtx.getInstance(parseContext, hiveDb, indexTableName);
+ removeGbyCtx.invokeRemoveGbyProc(topOp);
+ //Getting back new parseContext and new OpParseContext after GBY-RS-GBY is removed
+ parseContext = removeGbyCtx.getParseContext();
+ parseContext.setOpParseCtx(removeGbyCtx.getOpc());
+ LOG.info("Finished Group by Remove");
+ }
+
+ /* This part of the code checks if the 'SHOULD_APPEND_SUBQUERY' value in RewriteVars enum is set to true.
+ * If yes, it sets the environment for the RewriteIndexSubqueryCtx context and invokes
+ * method to append a new subquery that scans over the index table rather than the original table.
+ * We first create the subquery context, then copy the RowSchema/RowResolver from subquery to original operator tree.
+ * */
+ if(canApplyCtx.getBoolVar(hiveConf, RewriteVars.SHOULD_APPEND_SUBQUERY)){
+ //Context for appending a subquery to scan over the index table
+ RewriteIndexSubqueryCtx subqueryCtx = RewriteIndexSubqueryCtx.getInstance(parseContext, indexTableName, baseTableName,
+ canApplyCtx.getSelectColumnsList());
+ subqueryCtx.createSubqueryContext();
+
+ HashMap<TableScanOperator, Table> subqTopOpMap = subqueryCtx.getSubqueryPctx().getTopToTable();
+ Iterator<TableScanOperator> subqTopOpItr = subqTopOpMap.keySet().iterator();
+ TableScanOperator subqTopOp = null;
+ if(subqTopOpItr.hasNext()){
+ subqTopOp = subqTopOpItr.next();
+ subqueryCtx.invokeSubquerySelectSchemaProc(subqTopOp);
+ LOG.info("Finished Fetching subquery select schema");
+ subqueryCtx.invokeFixAllOperatorSchemasProc(topOp);
+ }
+
+ parseContext = subqueryCtx.getParseContext();
+ LOG.info("Finished appending subquery");
+ }
+ }
+
+ LOG.info("Finished Rewriting query");
+
+ }
+
+ private String getName() {
+ return "RewriteGBUsingIndex";
+ }
+
+
+ /**
+ * This method logs the reason for which we cannot apply the rewrite optimization.
+ * @return
+ */
+ boolean checkIfAllRewriteCriteriaIsMet(RewriteCanApplyCtx canApplyCtx){
+ if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.QUERY_HAS_DISTRIBUTE_BY)){
+ LOG.info("Query has distributeby clause, " +
+ "that is not supported with " + getName() + " optimization" );
+ return false;
+ }
+ if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.QUERY_HAS_SORT_BY)){
+ LOG.info("Query has sortby clause, " +
+ "that is not supported with " + getName() + " optimization" );
+ return false;
+ }
+ if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.QUERY_HAS_ORDER_BY)){
+ LOG.info("Query has orderby clause, " +
+ "that is not supported with " + getName() + " optimization" );
+ return false;
+ }
+ if (canApplyCtx.getIntVar(hiveConf, RewriteVars.AGG_FUNC_CNT) > 1 ){
+ LOG.info("More than 1 agg funcs: " +
+ "Not supported by " + getName() + " optimization" );
+ return false;
+ }
+ if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.AGG_FUNC_IS_NOT_COUNT)){
+ LOG.info("Agg func other than count is " +
+ "not supported by " + getName() + " optimization" );
+ return false;
+ }
+ if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.COUNT_ON_ALL_COLS)){
+ LOG.info("Currently count function needs group by on key columns. This is a count(*) case., "
+ + "Cannot apply this " + getName() + " optimization" );
+ return false;
+ }
+ if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.AGG_FUNC_COLS_FETCH_EXCEPTION)){
+ LOG.info("Got exception while locating child col refs " +
+ "of agg func, skipping " + getName() + " optimization" );
+ return false;
+ }
+ if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.WHR_CLAUSE_COLS_FETCH_EXCEPTION)){
+ LOG.info("Got exception while locating child col refs for where clause, "
+ + "skipping " + getName() + " optimization" );
+ return false;
+ }
+ if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.SEL_CLAUSE_COLS_FETCH_EXCEPTION)){
+ LOG.info("Got exception while locating child col refs for select list, "
+ + "skipping " + getName() + " optimization" );
+ return false;
+ }
+ if (canApplyCtx.getBoolVar(hiveConf, RewriteVars.GBY_KEYS_FETCH_EXCEPTION)){
+ LOG.info("Got exception while locating child col refs for GroupBy key, "
+ + "skipping " + getName() + " optimization" );
+ return false;
+ }
+ return true;
+ }
+
+
+
+ /**
+ * This block of code iterates over the topToTable map from ParseContext
+ * to determine if the query has a scan over multiple tables.
+ * @return
+ */
+ boolean ifQueryHasMultipleTables(){
+ HashMap<TableScanOperator, Table> topToTable = parseContext.getTopToTable();
+ Iterator<Table> valuesItr = topToTable.values().iterator();
+ Set<String> tableNameSet = new HashSet<String>();
+ while(valuesItr.hasNext()){
+ Table table = valuesItr.next();
+ tableNameSet.add(table.getTableName());
+ }
+ if(tableNameSet.size() > 1){
+ LOG.info("Query has more than one table " +
+ "that is not supported with " + getName() + " optimization" );
+ return true;
+ }
+ return false;
+ }
+
+
+ /**
+ * Given a base table meta data, and a list of index types for which we need to find a matching index,
+ * this method returns a list of matching index tables.
+ * @param baseTableMetaData
+ * @param matchIndexTypes
+ * @return
+ */
+ List<Index> getIndexes(Table baseTableMetaData, List<String> matchIndexTypes) {
+ List<Index> matchingIndexes = new ArrayList<Index>();
+ List<Index> indexesOnTable = null;
+
+ try {
+ short maxNumOfIndexes = 1024; // XTODO: Hardcoding. Need to know if
+ // there's a limit (and what is it) on
+ // # of indexes that can be created
+ // on a table. If not, why is this param
+ // required by metastore APIs?
+ indexesOnTable = baseTableMetaData.getAllIndexes(maxNumOfIndexes);
+
+ } catch (HiveException e) {
+ return matchingIndexes; // Return empty list (trouble doing rewrite
+ // shouldn't stop regular query execution,
+ // if there's serious problem with metadata
+ // or anything else, it's assumed to be
+ // checked & handled in core hive code itself.
+ }
+
+ for (int i = 0; i < indexesOnTable.size(); i++) {
+ Index index = null;
+ index = indexesOnTable.get(i);
+ // The handler class implies the type of the index (e.g. compact
+ // summary index would be:
+ // "org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler").
+ String indexType = index.getIndexHandlerClass();
+ for (int j = 0; j < matchIndexTypes.size(); j++) {
+ if (indexType.equals(matchIndexTypes.get(j))) {
+ matchingIndexes.add(index);
+ break;
+ }
+ }
+ }
+ return matchingIndexes;
+ }
+
+
+ /**
+ * We retrieve the list of index tables on the current table (represented by the TableScanOperator)
+ * which can be used to apply rewrite on the original query
+ * and return if there are no index tables to be used for rewriting the input query.
+ *
+ * @param topOp
+ * @return
+ */
+ HashMap<Index, Set<String>> getIndexTableInfoForRewrite(TableScanOperator topOp) {
+ HashMap<Index, Set<String>> indexTableMap = null;
+ TableScanOperator ts = (TableScanOperator) topOp;
+ Table tsTable = parseContext.getTopToTable().get(ts);
+ if (tsTable != null) {
+ List<String> idxType = new ArrayList<String>();
+ idxType.add(SUPPORTED_INDEX_TYPE);
+ List<Index> indexTables = getIndexes(tsTable, idxType);
+ if (indexTables.size() == 0) {
+ LOG.info("Table " + baseTableName + " does not have compact index. " +
+ "Cannot apply " + getName() + " optimization" );
+ }else{
+ indexTableMap = populateIndexToKeysMap(indexTables);
+ }
+ }
+ return indexTableMap;
+ }
+
+
+ /**
+ * This code block iterates over indexes on the table and picks
+ * up the first index that satisfies the rewrite criteria.
+ * @param indexTables
+ * @return
+ */
+ HashMap<Index, Set<String>> populateIndexToKeysMap(List<Index> indexTables){
+ Index index = null;
+ Hive hiveInstance = hiveDb;
+ HashMap<Index, Set<String>> indexToKeysMap = new LinkedHashMap<Index, Set<String>>();
+
+ for (int idxCtr = 0; idxCtr < indexTables.size(); idxCtr++) {
+ final Set<String> indexKeyNames = new LinkedHashSet<String>();
+ index = indexTables.get(idxCtr);
+
+ //Getting index key columns
+ StorageDescriptor sd = index.getSd();
+ List<FieldSchema> idxColList = sd.getCols();
+ for (FieldSchema fieldSchema : idxColList) {
+ indexKeyNames.add(fieldSchema.getName());
+ }
+
+
+ // Check that the index schema is as expected. This code block should
+ // catch problems of this rewrite breaking when the CompactIndexHandler
+ // index is changed.
+ // This dependency could be better handled by doing init-time check for
+ // compatibility instead of this overhead for every rewrite invocation.
+ ArrayList<String> idxTblColNames = new ArrayList<String>();
+ try {
+ Table idxTbl = hiveInstance.getTable(index.getDbName(),
+ index.getIndexTableName());
+ for (FieldSchema idxTblCol : idxTbl.getCols()) {
+ idxTblColNames.add(idxTblCol.getName());
+ }
+ } catch (HiveException e) {
+ LOG.info("Got exception while locating index table, " +
+ "skipping " + getName() + " optimization" );
+ return indexToKeysMap;
+ }
+ assert(idxTblColNames.contains(COMPACT_IDX_BUCKET_COL));
+ assert(idxTblColNames.contains(COMPACT_IDX_OFFSETS_ARRAY_COL));
+ assert(idxTblColNames.size() == indexKeyNames.size() + 2);
+
+ //we add all index tables which can be used for rewrite and defer the decision of using a particular index for later
+ //this is to allow choosing an index if a better mechanism is designed later to choose a better rewrite
+ indexToKeysMap.put(index, indexKeyNames);
+ }
+ return indexToKeysMap;
+
+ }
+
+
+
+
+}
+
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryCtx.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryCtx.java
new file mode 100644
index 0000000..433e528
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryCtx.java
@@ -0,0 +1,298 @@
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.PreOrderWalker;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.RowResolver;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+
+/**
+ * RewriteIndexSubqueryCtx class stores the context for the {@link RewriteIndexSubqueryProcFactory} processor factory methods
+ *
+ */
+public class RewriteIndexSubqueryCtx implements NodeProcessorCtx {
+
+ private RewriteIndexSubqueryCtx(ParseContext parseContext, String indexTableName,
+ String baseTableName, Set<String> selectColumnNames){
+ //instances are created only through the getInstance(..) factory method
+ this.parseContext = parseContext;
+ this.indexName = indexTableName;
+ this.baseTableName = baseTableName;
+ this.selectColumnNames = selectColumnNames;
+ }
+
+ public static RewriteIndexSubqueryCtx getInstance(ParseContext parseContext, String indexTableName,
+ String baseTableName, Set<String> selectColumnNames){
+ return new RewriteIndexSubqueryCtx(parseContext, indexTableName, baseTableName, selectColumnNames);
+ }
+ protected final Log LOG = LogFactory.getLog(RewriteIndexSubqueryCtx.class.getName());
+
+ //This is populated in RewriteIndexSubqueryProcFactory's NewQuerySelectSchemaProc processor with the colExprMap of the
+ //SelectOperator whose parent is TableScanOperator
+ private Map<String, ExprNodeDesc> newSelColExprMap = new LinkedHashMap<String, ExprNodeDesc>();
+ //The next two data structures are populated in RewriteIndexSubqueryProcFactory's NewQuerySelectSchemaProc processor
+ //with the colExprMap of the SelectOperator whose child is GroupByOperator
+ private final ArrayList<ExprNodeDesc> newSelColList = new ArrayList<ExprNodeDesc>();
+
+ // Initialise all data structures required to copy RowResolver, RowSchema, outputColumnNames, colList, colExprMap
+ //from subquery DAG to original DAG operators
+ private final ArrayList<String> newOutputCols = new ArrayList<String>();
+ private Map<String, ExprNodeDesc> newColExprMap = new HashMap<String, ExprNodeDesc>();
+ private final ArrayList<ExprNodeDesc> newColList = new ArrayList<ExprNodeDesc>();
+ private final ArrayList<ColumnInfo> newRS = new ArrayList<ColumnInfo>();
+ private RowResolver newRR = new RowResolver();
+
+ //This is populated in RewriteIndexSubqueryProcFactory's SubquerySelectSchemaProc processor for later
+ //use in NewQuerySelectSchemaProc processor
+ private final Map<String, String> aliasToInternal = new LinkedHashMap<String, String>();
+
+ // Get the parentOperators List for FileSinkOperator. We need this later to set the
+ // parentOperators for original DAG operator
+ private final List<Operator<? extends Serializable>> subqFSParentList = new ArrayList<Operator<? extends Serializable>>();
+
+ // We need the reference to this SelectOperator so that the original DAG can be appended here
+ private Operator<? extends Serializable> subqSelectOp;
+
+ //We replace the original TS operator with new TS operator from subquery context to scan over the index table
+ //rather than the original table
+ private Operator<? extends Serializable> newTSOp;
+
+ private final ParseContext parseContext;
+ private final Set<String> selectColumnNames;
+ private final String indexName;
+ private final String baseTableName;
+
+ private ParseContext subqueryPctx = null;
+ private ParseContext newDAGCtx = null;
+
+ //We need the GenericUDAFEvaluator for GenericUDAF function "sum" when we append subquery to original operator tree
+ private GenericUDAFEvaluator eval = null;
+
+
+ public Set<String> getSelectColumnNames() {
+ return selectColumnNames;
+ }
+
+ public ArrayList<String> getNewOutputCols() {
+ return newOutputCols;
+ }
+
+ public Map<String, ExprNodeDesc> getNewColExprMap() {
+ return newColExprMap;
+ }
+
+ public void setNewColExprMap(Map<String, ExprNodeDesc> newColExprMap) {
+ this.newColExprMap = newColExprMap;
+ }
+
+ public ArrayList<ExprNodeDesc> getNewColList() {
+ return newColList;
+ }
+
+ public ArrayList<ColumnInfo> getNewRS() {
+ return newRS;
+ }
+
+ public RowResolver getNewRR() {
+ return newRR;
+ }
+
+ public void setNewRR(RowResolver newRR) {
+ this.newRR = newRR;
+ }
+
+ public List<Operator<? extends Serializable>> getSubqFSParentList() {
+ return subqFSParentList;
+ }
+
+ public Operator<? extends Serializable> getSubqSelectOp() {
+ return subqSelectOp;
+ }
+
+ public void setSubqSelectOp(Operator<? extends Serializable> subqSelectOp) {
+ this.subqSelectOp = subqSelectOp;
+ }
+
+ public Map<String, String> getAliasToInternal() {
+ return aliasToInternal;
+ }
+
+ public ParseContext getParseContext() {
+ return parseContext;
+ }
+
+ public ParseContext getSubqueryPctx() {
+ return subqueryPctx;
+ }
+
+ public void setSubqueryPctx(ParseContext subqueryPctx) {
+ this.subqueryPctx = subqueryPctx;
+ }
+
+ public ParseContext getNewDAGCtx() {
+ return newDAGCtx;
+ }
+
+ public void setNewDAGCtx(ParseContext newDAGCtx) {
+ this.newDAGCtx = newDAGCtx;
+ }
+
+ public Map<String, ExprNodeDesc> getNewSelColExprMap() {
+ return newSelColExprMap;
+ }
+
+ public void setNewSelColExprMap(Map<String, ExprNodeDesc> newSelColExprMap) {
+ this.newSelColExprMap = newSelColExprMap;
+ }
+
+ public ArrayList<ExprNodeDesc> getNewSelColList() {
+ return newSelColList;
+ }
+
+ public String getIndexName() {
+ return indexName;
+ }
+
+ public String getBaseTableName() {
+ return baseTableName;
+ }
+
+ public GenericUDAFEvaluator getEval() {
+ return eval;
+ }
+
+ public void setEval(GenericUDAFEvaluator eval) {
+ this.eval = eval;
+ }
+
+
+ public void setNewTSOp(Operator<? extends Serializable> newTSOp) {
+ this.newTSOp = newTSOp;
+ }
+
+ public Operator<? extends Serializable> getNewTSOp() {
+ return newTSOp;
+ }
+
+ /**
+ * We construct the subquery command string using the index key columns
+ * and use the {@link RewriteParseContextGenerator} to generate an operator tree
+ * and its ParseContext for that command
+ */
+ void createSubqueryContext() {
+ String selKeys = "";
+ for (String key : selectColumnNames) {
+ selKeys += key + ",";
+ }
+ String subqueryCommand = "select " + selKeys + " size(`_offsets`) as CNT from " + indexName;
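+ //For illustration only (index key and index table names are hypothetical): if selectColumnNames contains "key"
+ //and the index table is "tbl_idx", the command built above is "select key, size(`_offsets`) as CNT from tbl_idx"
+ //(the trailing comma appended by the loop is immediately followed by the size(..) expression, so the command stays valid)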
+ subqueryPctx = RewriteParseContextGenerator.generateOperatorTree(parseContext.getConf(), subqueryCommand);
+
+ }
+
+ /**
+ * Walk the subquery operator tree with the {@link DefaultGraphWalker} using the rules below.
+ * Each of the rules invokes a method from the {@link RewriteIndexSubqueryProcFactory}
+ * to remove the subquery FileSinkOperator and to copy the subquery SelectOperator schema into this context.
+ * @param topOp
+ * @throws SemanticException
+ */
+ public void invokeSubquerySelectSchemaProc(Operator<? extends Serializable> topOp) throws SemanticException{
+ Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+ //removes the subquery FileSinkOperator from subquery OpParseContext as
+ //we do not need to append FS operator to original operator tree
+ opRules.put(new RuleRegExp("R1", "FS%"), RewriteIndexSubqueryProcFactory.getSubqueryFileSinkProc());
+ //copies the RowSchema, outputColumnNames, colList, RowResolver, columnExprMap to RewriteIndexSubqueryCtx data structures
+ opRules.put(new RuleRegExp("R2", "SEL%"), RewriteIndexSubqueryProcFactory.getSubquerySelectSchemaProc());
+
+ // The dispatcher fires the processor corresponding to the closest matching
+ // rule and passes the context along
+ Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this);
+ GraphWalker ogw = new DefaultGraphWalker(disp);
+
+ // Create a list of topop nodes
+ ArrayList<Node> topNodes = new ArrayList<Node>();
+ topNodes.add(topOp);
+ ogw.startWalking(topNodes, null);
+
+ }
+
+
+
+ /**
+ * Walk the original operator tree with the {@link PreOrderWalker} using the rules below.
+ * This method appends the subquery operator tree to the original operator tree,
+ * replaces the original table scan operator with the index table scan operator,
+ * and copies the information from {@link RewriteIndexSubqueryCtx} to the
+ * appropriate operators of the original operator tree.
+ * @param topOp
+ * @throws SemanticException
+ */
+ public void invokeFixAllOperatorSchemasProc(Operator<? extends Serializable> topOp) throws SemanticException{
+ Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+
+ //appends subquery operator tree to original operator tree
+ opRules.put(new RuleRegExp("R1", "TS%"), RewriteIndexSubqueryProcFactory.getAppendSubqueryToOriginalQueryProc());
+
+ //copies RowSchema, outputColumnNames, colList, RowResolver, columnExprMap from RewriteIndexSubqueryCtx data structures
+ // to SelectOperator of original operator tree
+ opRules.put(new RuleRegExp("R2", "SEL%"), RewriteIndexSubqueryProcFactory.getNewQuerySelectSchemaProc());
+ //Manipulates the ExprNodeDesc from FilterOperator predicate list as per colList data structure from RewriteIndexSubqueryCtx
+ opRules.put(new RuleRegExp("R3", "FIL%"), RewriteIndexSubqueryProcFactory.getNewQueryFilterSchemaProc());
+ //Manipulates the ExprNodeDesc from GroupByOperator aggregation list and parameters list
+ //as per colList data structure from RewriteIndexSubqueryCtx
+ opRules.put(new RuleRegExp("R4", "GBY%"), RewriteIndexSubqueryProcFactory.getNewQueryGroupbySchemaProc());
+
+ // The dispatcher fires the processor corresponding to the closest matching
+ // rule and passes the context along
+ Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this);
+ GraphWalker ogw = new PreOrderWalker(disp);
+
+ // Create a list of topop nodes
+ ArrayList<Node> topNodes = new ArrayList<Node>();
+ topNodes.add(topOp);
+
+ ogw.startWalking(topNodes, null);
+
+ }
+
+
+ /**
+ * Default procedure for {@link DefaultRuleDispatcher}
+ * @return
+ */
+ private NodeProcessor getDefaultProc() {
+ return new NodeProcessor() {
+ @Override
+ public Object process(Node nd, Stack<Node> stack,
+ NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
+ return null;
+ }
+ };
+ }
+
+
+
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryProcFactory.java
new file mode 100644
index 0000000..989c6aa
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteIndexSubqueryProcFactory.java
@@ -0,0 +1,605 @@
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.RowResolver;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+
+/**
+ * Factory of processors used by {@link RewriteGBUsingIndex} through the invokeSubquerySelectSchemaProc(..) and
+ * invokeFixAllOperatorSchemasProc(..) methods of {@link RewriteIndexSubqueryCtx}.
+ * Each of the processors is invoked according to a rule and serves to append the subquery to the original operator tree.
+ *
+ * This subquery scans over the index table rather than the original table.
+ * It replaces the count(literal)/count(index_key) function in the original select operator
+ * with sum(cnt), where cnt is size(`_offsets`) from the subquery select operator.
+ *
+ * This change necessitates changes to the rowSchema, colList, colExprMap and rowResolver of all the SelectOperators in the original
+ * operator tree. It also requires setting appropriate predicate parameters and group-by aggregation parameters in the original
+ * operator tree. Each of the processors in this factory takes care of part of these changes.
+ *
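+ * For illustration only (table and index names are hypothetical): a query such as
+ *   SELECT key, count(key) FROM tbl GROUP BY key
+ * is rewritten so that the aggregation runs over a subquery of the form
+ *   SELECT key, size(`_offsets`) as cnt FROM tbl_idx
+ * with the original count(..) replaced by sum(cnt).
+ *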
+ */
+public final class RewriteIndexSubqueryProcFactory {
+ protected final static Log LOG = LogFactory.getLog(RewriteIndexSubqueryProcFactory.class.getName());
+ private static RewriteIndexSubqueryCtx subqueryCtx = null;
+
+ private RewriteIndexSubqueryProcFactory() {
+ //this prevents the class from getting instantiated
+ }
+
+ /**
+ * This processor retrieves the rowSchema, rowResolver, colList, colExprMap and outputColumnNames data structures
+ * from the SelectOperator and its descriptor(SelectDesc). It stores the information in the RewriteIndexSubqueryCtx instance
+ * for later use in correcting the schema of original operator tree.
+ *
+ */
+ private static class SubquerySelectSchemaProc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ SelectOperator operator = (SelectOperator)nd;
+ subqueryCtx = (RewriteIndexSubqueryCtx)ctx;
+
+ //We need to clear this every time in cases where there are multiple operator tree paths with multiple SelectOperators
+ subqueryCtx.getNewOutputCols().clear();
+ subqueryCtx.getNewColExprMap().clear();
+ subqueryCtx.getNewColList().clear();
+ subqueryCtx.getNewRS().clear();
+ subqueryCtx.setNewRR(new RowResolver());
+
+
+ RowResolver oldRR = subqueryCtx.getSubqueryPctx().getOpParseCtx().get(operator).getRowResolver();
+ SelectDesc oldConf = (SelectDesc) operator.getConf();
+ Map<String, ExprNodeDesc> oldColumnExprMap = operator.getColumnExprMap();
+ ArrayList<ExprNodeDesc> oldColList = oldConf.getColList();
+
+ //We create the mapping of column name alias to internal name for later use in correcting original operator tree
+ ArrayList<ColumnInfo> schemaSign = operator.getSchema().getSignature();
+ for (ColumnInfo columnInfo : schemaSign) {
+ String internal = columnInfo.getInternalName();
+ String alias = columnInfo.getAlias();
+ subqueryCtx.getAliasToInternal().put(alias, internal);
+ }
+
+ /**outputColumnNames**/
+ String internalName = null;
+ for(int i=0; i < oldConf.getOutputColumnNames().size(); i++){
+ internalName = oldConf.getOutputColumnNames().get(i);
+ //Populate all output columns (required by SelectOperators in original DAG) in RewriteIndexSubqueryCtx
+ subqueryCtx.getNewOutputCols().add(new String(internalName));
+
+ /**colExprMap**/
+ if(oldColumnExprMap != null){
+ ExprNodeDesc end = oldColumnExprMap.get(internalName); //in case of simple column names
+ if(end instanceof ExprNodeColumnDesc){
+ ExprNodeColumnDesc oldDesc = (ExprNodeColumnDesc)end ;
+ ExprNodeColumnDesc newDesc = (ExprNodeColumnDesc) oldDesc.clone();
+ newDesc.setColumn(internalName);
+ //Populate columnExprMap (required by SelectOperator and FilterOperator in original DAG) in RewriteIndexSubqueryCtx
+ subqueryCtx.getNewColExprMap().put(internalName, newDesc);
+ }else if(end instanceof ExprNodeGenericFuncDesc){ //in case of functions on columns
+ ExprNodeGenericFuncDesc oldDesc = (ExprNodeGenericFuncDesc)end ;
+ ExprNodeGenericFuncDesc newDesc = (ExprNodeGenericFuncDesc) oldDesc.clone();
+ List<ExprNodeDesc> childExprs = newDesc.getChildExprs();
+ List<ExprNodeDesc> newChildExprs = new ArrayList<ExprNodeDesc>();
+ for (ExprNodeDesc childEnd : childExprs) { //we have the list of columns here
+ if(childEnd instanceof ExprNodeColumnDesc){
+ ((ExprNodeColumnDesc) childEnd).setColumn(internalName);
+ newChildExprs.add(childEnd);
+ }
+ newDesc.setChildExprs(newChildExprs);
+ //Populate columnExprMap (required by SelectOperator and FilterOperator in original DAG) in RewriteIndexSubqueryCtx
+ subqueryCtx.getNewColExprMap().put(internalName, newDesc);
+ }
+ }
+ }
+
+ /**colList**/
+ if(oldColList != null){
+ ExprNodeDesc exprNodeDesc = oldColList.get(i);
+ if(exprNodeDesc instanceof ExprNodeColumnDesc){//in case of simple column names
+ ExprNodeColumnDesc newDesc = (ExprNodeColumnDesc) exprNodeDesc.clone();
+ newDesc.setColumn(internalName);
+ //Populate colList (required by SelectOperators in original DAG) in RewriteIndexSubqueryCtx
+ subqueryCtx.getNewColList().add(newDesc);
+ }else if(exprNodeDesc instanceof ExprNodeGenericFuncDesc){//in case of functions on columns
+ ExprNodeGenericFuncDesc oldDesc = (ExprNodeGenericFuncDesc)exprNodeDesc ;
+ ExprNodeGenericFuncDesc newDesc = (ExprNodeGenericFuncDesc) oldDesc.clone();
+ List<ExprNodeDesc> childExprs = newDesc.getChildExprs();
+ List<ExprNodeDesc> newChildExprs = new ArrayList<ExprNodeDesc>();
+ for (ExprNodeDesc childEnd : childExprs) {//we have the list of columns here
+ if(childEnd instanceof ExprNodeColumnDesc){
+ ((ExprNodeColumnDesc) childEnd).setColumn(internalName);
+ newChildExprs.add(childEnd);
+ }
+ newDesc.setChildExprs(newChildExprs);
+ //Populate colList (required by SelectOperators in original DAG) in RewriteIndexSubqueryCtx
+ subqueryCtx.getNewColList().add(newDesc);
+ }
+ }
+ }
+ }
+
+ /**RowSchema and RowResolver**/
+ for (int i = 0; i < subqueryCtx.getNewOutputCols().size(); i++) {
+ internalName = subqueryCtx.getNewOutputCols().get(i);
+ String[] nm = oldRR.reverseLookup(internalName);
+ ColumnInfo col;
+ try {
+ //We need to set the alias for the new index table subquery
+ col = oldRR.get(nm[0], nm[1]);
+ if(nm[0] == null){
+ nm[0] = "v" + i; //add different alias in case original query has multiple subqueries
+ }
+ // Populate RowResolver and RowSchema (required by SelectOperator and FilterOperator in original DAG) in RewriteIndexSubqueryCtx
+ subqueryCtx.getNewRR().put(nm[0], nm[1], col);
+ subqueryCtx.getNewRS().add(col);
+ } catch (SemanticException e) {
+ //the process(..) method already declares SemanticException, so propagate it instead of swallowing it
+ throw e;
+ }
+ }
+ //We need this SelectOperator from subquery as a reference point to append in original query
+ subqueryCtx.setSubqSelectOp(operator);
+
+ return null;
+ }
+ }
+
+ public static SubquerySelectSchemaProc getSubquerySelectSchemaProc(){
+ return new SubquerySelectSchemaProc();
+ }
+
+
+ /**
+ * We do not need the fileSinkOperator of the subquery operator tree when we append the rest of the subquery operator tree
+ * to the original operator tree. This processor gets rid of this FS operator by removing it from subquery OpParseContext.
+ *
+ */
+ private static class SubqueryFileSinkProc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ FileSinkOperator operator = (FileSinkOperator)nd;
+ subqueryCtx = (RewriteIndexSubqueryCtx)ctx;
+ //Store the list of FileSinkOperator's parent operators as we later append the original query
+ //at the end of the subquery operator tree (without the FileSinkOperator).
+ subqueryCtx.getSubqFSParentList().addAll(operator.getParentOperators());
+ subqueryCtx.getSubqueryPctx().getOpParseCtx().remove(operator);
+ return null;
+ }
+ }
+
+ public static SubqueryFileSinkProc getSubqueryFileSinkProc(){
+ return new SubqueryFileSinkProc();
+ }
+
+ /**
+ * This processor appends the subquery operator tree to the original operator tree.
+ * Since genPlan(..) method from the SemanticAnalyzer creates the operator tree bottom-up i.e.
+ * FROM-WHERE-GROUPBY-ORDERBY-SELECT etc, any query with nested subqueries will have the TableScanOperator of the
+ * innermost subquery as the top operator in the topOps and topToTable maps.
+ *
+ * Any subquery which is a part of the from clause
+ * (eg: SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2;) always has its
+ * DAG operator tree appended before the operator tree of the enclosing query.
+ * For example, for the above query, the operator tree is:
+ * SEL(1)[subq]--->GBY(2)[subq]--->RS(3)[subq]--->GBY(4)[subq]--->SEL(5)[subq]--->FIL(6)[orig]--->SEL(7)[orig]--->FS(8)[orig]>
+ *
+ * We replace the TableScanOperator (TS) of the original operator tree with the whole subquery operator tree (without the
+ * FileSinkOperator of the subquery operator tree).
+ *
+ */
+ private static class AppendSubqueryToOriginalQueryProc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ TableScanOperator operator = (TableScanOperator)nd;
+ subqueryCtx = (RewriteIndexSubqueryCtx)ctx;
+ List<Operator<? extends Serializable>> origChildrenList = operator.getChildOperators();
+
+ /* origChildrenList has the child operators for the TableScanOperator of the original DAG
+ * We need to get rid of the TS operator of original DAG and append rest of the tree to the sub-query operator DAG
+ * This code sets the parentOperators of first operator in origChildrenList to subqFSParentList.
+ * subqFSParentList contains the parentOperators list of the FileSinkOperator of the sub-query operator DAG
+ *
+ * subqLastOp is the last SelectOperator of sub-query DAG. The rest of the original operator DAG needs to be appended here
+ * Hence, set the subqLastOp's child operators to be origChildrenList
+ *
+ * */
+ if(origChildrenList != null && origChildrenList.size() > 0){
+ origChildrenList.get(0).setParentOperators(subqueryCtx.getSubqFSParentList());
+ }
+ if(subqueryCtx.getSubqSelectOp() != null){
+ subqueryCtx.getSubqSelectOp().setChildOperators(origChildrenList);
+ }
+
+ /* The operator DAG plan is generated in the order FROM-WHERE-GROUPBY-ORDERBY-SELECT
+ * We have appended the original operator DAG at the end of the sub-query operator DAG
+ * as the sub-query will always be a part of FROM processing
+ * Now we need to insert the final sub-query+original DAG to the original ParseContext
+ */
+
+ HashMap<String, Operator<? extends Serializable>> subqTopMap = subqueryCtx.getSubqueryPctx().getTopOps();
+ Iterator<String> subqTabItr = subqTopMap.keySet().iterator();
+ String subqTab = subqTabItr.next();
+ Operator<? extends Serializable> subqOp = subqTopMap.get(subqTab);
+ Table tbl = subqueryCtx.getSubqueryPctx().getTopToTable().get(subqOp);
+
+ //remove original TableScanOperator from the topToTable map
+ //Put the new TableScanOperator (top operator of the subquery operator tree) to topToTable map
+ subqueryCtx.getParseContext().getTopToTable().remove(operator);
+ subqueryCtx.getParseContext().getTopToTable().put((TableScanOperator) subqOp, tbl);
+
+ String tabAlias = "";
+ if(subqueryCtx.getBaseTableName().contains(":")){
+ String[] tabToAlias = subqueryCtx.getBaseTableName().split(":");
+ if(tabToAlias.length > 1){
+ tabAlias = tabToAlias[0] + ":";
+ }
+ }
+ //remove original table and operator tree mapping from topOps
+ //put the new table alias and subquery index table as the key and the new operator tree as value in topOps
+ subqueryCtx.getParseContext().getTopOps().remove(subqueryCtx.getBaseTableName());
+ subqueryCtx.getParseContext().getTopOps().put(tabAlias + subqTab, subqOp);
+
+ //we need this later
+ subqueryCtx.setNewTSOp(subqOp);
+
+ //remove original TableScanOperator from the original OpParsecontext
+ //add all values from the subquery OpParseContext to the original OpParseContext
+ subqueryCtx.getParseContext().getOpParseCtx().remove(operator);
+ subqueryCtx.getParseContext().getOpParseCtx().putAll(subqueryCtx.getSubqueryPctx().getOpParseCtx());
+ LOG.info("Finished appending subquery");
+ return null;
+ }
+ }
+
+ public static AppendSubqueryToOriginalQueryProc getAppendSubqueryToOriginalQueryProc(){
+ return new AppendSubqueryToOriginalQueryProc();
+ }
+
+
+
+ /**
+ * NewQuerySelectSchemaProc.
+ * This processor fixes the schema of the SelectOperators in the original operator tree
+ * (colExprMap, RowResolver, RowSchema, colList, outputColumnNames) using the information
+ * collected from the subquery SelectOperator in {@link RewriteIndexSubqueryCtx}.
+ */
+ private static class NewQuerySelectSchemaProc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ SelectOperator operator = (SelectOperator)nd;
+ subqueryCtx = (RewriteIndexSubqueryCtx)ctx;
+
+ List<Operator<? extends Serializable>> parentOps = operator.getParentOperators();
+ Operator<? extends Serializable> parentOp = parentOps.iterator().next();
+ List<Operator<? extends Serializable>> childOps = operator.getChildOperators();
+ Operator<? extends Serializable> childOp = childOps.iterator().next();
+
+
+ if(parentOp instanceof TableScanOperator){
+ //We need to copy the colExprMap of this SelectOperator whose parent is TableScanOperator to the
+ //colExprMap of the SelectOperator whose child operator is a GroupByOperator
+ subqueryCtx.setNewSelColExprMap(operator.getColumnExprMap());
+ }else if((!(parentOp instanceof TableScanOperator)) //skip first SelectOperator in operator tree
+ && (!(childOp instanceof FileSinkOperator)) //skip last SelectOperator in operator tree
+ && (!(childOp instanceof ReduceSinkOperator))){ //skip the SelectOperator which appears before a JOIN in operator tree
+
+ //Copy colList and outputColumns for SelectOperator from sub-query DAG SelectOperator
+ //these are all the SelectOperators that come in between the first SelectOperator and last SelectOperator in the operator tree
+ operator.setColumnExprMap(subqueryCtx.getNewColExprMap());
+ subqueryCtx.getParseContext().getOpParseCtx().get(operator).setRowResolver(subqueryCtx.getNewRR());
+ operator.getSchema().setSignature(subqueryCtx.getNewRS());
+ SelectDesc conf = (SelectDesc) operator.getConf();
+ conf.setColList(subqueryCtx.getNewColList());
+ conf.setOutputColumnNames(subqueryCtx.getNewOutputCols());
+ }
+
+ if (childOp instanceof GroupByOperator){
+ //use the original columnExprMap to construct the newColList
+ subqueryCtx.getNewSelColList().clear();
+ /**colList**/
+ Set<String> internalNamesList = operator.getColumnExprMap().keySet();
+ for (String internal : internalNamesList) {
+ ExprNodeDesc end = operator.getColumnExprMap().get(internal).clone();
+ if(end instanceof ExprNodeGenericFuncDesc){
+ List<ExprNodeDesc> colExprs = ((ExprNodeGenericFuncDesc)end).getChildExprs();
+ for (ExprNodeDesc colExpr : colExprs) {
+ if(colExpr instanceof ExprNodeColumnDesc){
+ if(!subqueryCtx.getNewSelColList().contains(colExpr)){
+ TypeInfo typeInfo = colExpr.getTypeInfo();
+ if(typeInfo instanceof ListTypeInfo){
+ PrimitiveTypeInfo pti = new PrimitiveTypeInfo();
+ pti.setTypeName("int");
+ colExpr.setTypeInfo(pti);
+ }
+ subqueryCtx.getNewSelColList().add(colExpr);
+ }
+ }
+ }
+
+ }else if(end instanceof ExprNodeColumnDesc){
+ if(!subqueryCtx.getNewSelColList().contains(end)){
+ subqueryCtx.getNewSelColList().add(end);
+ }
+ }
+ }
+ //Set the new colExprMap and new colList
+ operator.setColumnExprMap(subqueryCtx.getNewSelColExprMap());
+ SelectDesc selDesc = (SelectDesc) operator.getConf();
+ selDesc.setColList(subqueryCtx.getNewSelColList());
+ }
+
+ return null;
+ }
+ }
+
+ public static NewQuerySelectSchemaProc getNewQuerySelectSchemaProc(){
+ return new NewQuerySelectSchemaProc();
+ }
+
+
+ /**
+ * We need to replace the count(literal) GenericUDAF aggregation function of the group-by construct with the "sum" GenericUDAF.
+ * This processor creates a new operator tree for a sample query whose plan contains a GroupByOperator with a sum aggregation function
+ * and uses that GroupByOperator to replace the aggregation information of the original GroupByOperator.
+ * It replaces the AggregationDesc (aggregation descriptor) of the old GroupByOperator with the AggregationDesc
+ * of the new GroupByOperator.
+ *
+ * The processor also corrects the RowSchema and group-by keys by replacing the existing internal names with the new internal names.
+ * This change is required because we insert a new subquery into the original query.
+ *
+ */
+ private static class NewQueryGroupbySchemaProc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ GroupByOperator operator = (GroupByOperator)nd;
+ subqueryCtx = (RewriteIndexSubqueryCtx)ctx;
+
+ //We need to replace the GroupByOperator which is in groupOpToInputTables map with the new GroupByOperator
+ if(subqueryCtx.getParseContext().getGroupOpToInputTables().containsKey(operator)){
+ //we need to get rid of the alias and construct a query only with the base table name
+ String table = subqueryCtx.getBaseTableName();
+ if(table.contains(":")){
+ String[] aliasAndTab = table.split(":");
+ table = aliasAndTab[1];
+ }
+ String selReplacementCommand = "";
+ if(subqueryCtx.getSelectColumnNames().iterator().hasNext()){
+ //the query contains the sum aggregation GenericUDAF
+ selReplacementCommand = "select sum(" + subqueryCtx.getSelectColumnNames().iterator().next() + ") as TOTAL from " + table
+ + " group by " + subqueryCtx.getSelectColumnNames().iterator().next() + " ";
+ }
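+ //For illustration only (column and table names are hypothetical): if the select column is "key" and the
+ //base table is "tbl", the command built above is "select sum(key) as TOTAL from tbl group by key".
+ //Its plan is generated only so that the sum AggregationDesc/evaluator can be copied into the original GroupByOperator.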
+ //create a new ParseContext for the query to retrieve its operator tree, and the required GroupByOperator from it
+ ParseContext newDAGContext = RewriteParseContextGenerator.generateOperatorTree(subqueryCtx.getParseContext().getConf(),
+ selReplacementCommand);
+ subqueryCtx.setNewDAGCtx(newDAGContext);
+
+ //we get our new GroupByOperator here
+ Map<GroupByOperator, Set<String>> newGbyOpMap = subqueryCtx.getNewDAGCtx().getGroupOpToInputTables();
+ GroupByOperator newGbyOperator = newGbyOpMap.keySet().iterator().next();
+
+ //remove the old GroupByOperator
+ GroupByDesc oldConf = operator.getConf();
+ ArrayList<AggregationDesc> oldAggrList = oldConf.getAggregators();
+ if(oldAggrList != null && oldAggrList.size() > 0){
+ for (AggregationDesc aggregationDesc : oldAggrList) {
+ if(aggregationDesc != null && aggregationDesc.getGenericUDAFName().equals("count")){
+ oldAggrList.remove(aggregationDesc);
+ break;
+ }
+
+ }
+ }
+
+ //Construct the new AggregationDesc to get rid of the current internal names and replace them with new internal names
+ //as required by the operator tree
+ GroupByDesc newConf = newGbyOperator.getConf();
+ ArrayList<AggregationDesc> newAggrList = newConf.getAggregators();
+ if(newAggrList != null && newAggrList.size() > 0){
+ for (AggregationDesc aggregationDesc : newAggrList) {
+ subqueryCtx.setEval(aggregationDesc.getGenericUDAFEvaluator());
+ ArrayList<ExprNodeDesc> paraList = aggregationDesc.getParameters();
+ for (int i=0; i< paraList.size(); i++) {
+ ExprNodeDesc exprNodeDesc = paraList.get(i);
+ if(exprNodeDesc instanceof ExprNodeColumnDesc){
+ ExprNodeColumnDesc encd = (ExprNodeColumnDesc)exprNodeDesc;
+ String col = "cnt";
+ if(subqueryCtx.getAliasToInternal().containsKey(col)){
+ encd.setColumn(subqueryCtx.getAliasToInternal().get(col));
+ }
+ encd.setTabAlias(null);
+ exprNodeDesc = encd;
+ }
+ paraList.set(i, exprNodeDesc);
+ }
+ oldAggrList.add(aggregationDesc);
+ }
+ }
+
+ //Construct the new colExprMap to get rid of the current internal names and replace them with new internal names
+ //as required by the operator tree
+ Map<String, ExprNodeDesc> newGbyColExprMap = new LinkedHashMap<String, ExprNodeDesc>();
+ Map<String, ExprNodeDesc> oldGbyColExprMap = operator.getColumnExprMap();
+ Set<String> internalNameSet = oldGbyColExprMap.keySet();
+ for (String internal : internalNameSet) {
+ ExprNodeDesc exprNodeDesc = oldGbyColExprMap.get(internal).clone();
+ if(exprNodeDesc instanceof ExprNodeColumnDesc){
+ ExprNodeColumnDesc encd = (ExprNodeColumnDesc)exprNodeDesc;
+ String col = encd.getColumn();
+ if(subqueryCtx.getSelectColumnNames().contains(col)){
+ encd.setColumn(subqueryCtx.getAliasToInternal().get(col));
+ }
+ }else if(exprNodeDesc instanceof ExprNodeGenericFuncDesc){
+ List<ExprNodeDesc> colExprs = ((ExprNodeGenericFuncDesc)exprNodeDesc).getChildExprs();
+ for (ExprNodeDesc colExpr : colExprs) {
+ if(colExpr instanceof ExprNodeColumnDesc){
+ ExprNodeColumnDesc encd = (ExprNodeColumnDesc)colExpr;
+ String col = encd.getColumn();
+ if(subqueryCtx.getSelectColumnNames().contains(col)){
+ encd.setColumn(subqueryCtx.getAliasToInternal().get(col));
+ }
+ }
+ }
+
+ }
+ newGbyColExprMap.put(internal, exprNodeDesc);
+ }
+
+ //Construct the new group-by keys to get rid of the current internal names and replace them with new internal names
+ //as required by the operator tree
+ ArrayList<ExprNodeDesc> newGbyKeys = new ArrayList<ExprNodeDesc>();
+ ArrayList<ExprNodeDesc> oldGbyKeys = oldConf.getKeys();
+ for (int i =0; i< oldGbyKeys.size(); i++) {
+ ExprNodeDesc exprNodeDesc = oldGbyKeys.get(i).clone();
+ if(exprNodeDesc instanceof ExprNodeColumnDesc){
+ ExprNodeColumnDesc encd = (ExprNodeColumnDesc)exprNodeDesc;
+ String col = encd.getColumn();
+ if(subqueryCtx.getSelectColumnNames().contains(col)){
+ encd.setColumn(subqueryCtx.getAliasToInternal().get(col));
+ }
+ exprNodeDesc = encd;
+ }else if(exprNodeDesc instanceof ExprNodeGenericFuncDesc){
+ ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc)exprNodeDesc;
+ List<ExprNodeDesc> colExprs = engfd.getChildExprs();
+ for (ExprNodeDesc colExpr : colExprs) {
+ if(colExpr instanceof ExprNodeColumnDesc){
+ ExprNodeColumnDesc encd = (ExprNodeColumnDesc)colExpr;
+ String col = encd.getColumn();
+ if(subqueryCtx.getSelectColumnNames().contains(col)){
+ encd.setColumn(subqueryCtx.getAliasToInternal().get(col));
+ }
+
+ }
+ }
+ }
+ newGbyKeys.add(exprNodeDesc);
+ }
+
+ //Construct the new RowSchema. We do not need an alias for the new internal names
+ RowSchema oldRS = operator.getSchema();
+ ArrayList<ColumnInfo> oldSign = oldRS.getSignature();
+ ArrayList<ColumnInfo> newSign = new ArrayList<ColumnInfo>();
+ for (ColumnInfo columnInfo : oldSign) {
+ columnInfo.setAlias(null);
+ newSign.add(columnInfo);
+ }
+
+ //reset the above data structures in the original GroupByOperator
+ oldRS.setSignature(newSign);
+ operator.setSchema(oldRS);
+ oldConf.setKeys(newGbyKeys);
+ oldConf.setAggregators(oldAggrList);
+ operator.setColumnExprMap(newGbyColExprMap);
+ operator.setConf(oldConf);
+
+ }else{
+ //we just need to reset the GenericUDAFEvaluator and its name for this GroupByOperator whose parent is the
+ //ReduceSinkOperator
+ GroupByDesc childConf = (GroupByDesc) operator.getConf();
+ ArrayList<AggregationDesc> childAggrList = childConf.getAggregators();
+ if(childAggrList != null && childAggrList.size() > 0){
+ for (AggregationDesc aggregationDesc : childAggrList) {
+ aggregationDesc.setGenericUDAFEvaluator(subqueryCtx.getEval());
+ aggregationDesc.setGenericUDAFName("sum");
+ }
+ }
+
+ }
+
+ return null;
+ }
+ }
+
+ public static NewQueryGroupbySchemaProc getNewQueryGroupbySchemaProc(){
+ return new NewQueryGroupbySchemaProc();
+ }
+
+
+ /**
+ * This processor corrects the RowResolver for the FilterOperator of the original operator tree using
+ * the RowResolver obtained from the subquery SelectOperator in SubquerySelectSchemaProc processor.
+ * It also needs to replace the current internal names with new internal names for all instances of the
+ * ExprNodeColumnDesc. It recursively calls the setFilterPredicateCol(..) method to set this information correctly.
+ *
+ */
+ private static class NewQueryFilterSchemaProc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ FilterOperator operator = (FilterOperator)nd;
+ subqueryCtx = (RewriteIndexSubqueryCtx)ctx;
+ //Set new RowResolver
+ operator.getSchema().setSignature(subqueryCtx.getNewRS());
+ subqueryCtx.getParseContext().getOpParseCtx().get(operator).setRowResolver(subqueryCtx.getNewRR());
+
+ //Set correct internalNames
+ FilterDesc conf = operator.getConf();
+ ExprNodeDesc exprNodeDesc = conf.getPredicate();
+ setFilterPredicateCol(exprNodeDesc);
+ conf.setPredicate(exprNodeDesc);
+ return null;
+ }
+ }
+
+
+ /**
+ * This method is recursively called whenever we have our expression node descriptor to be an instance of the ExprNodeGenericFuncDesc.
+ * We exit the recursion when we find an instance of ExprNodeColumnDesc and set its column name to internal name
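+ * For example, for a predicate such as (key > 100) (illustrative column and literal), the recursion descends
+ * through the ExprNodeGenericFuncDesc of ">" until it reaches the ExprNodeColumnDesc of "key" and replaces
+ * that column name with its internal name from the alias-to-internal-name mapping.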
+ * @param exprNodeDesc
+ */
+ private static void setFilterPredicateCol(ExprNodeDesc exprNodeDesc){
+ if(exprNodeDesc instanceof ExprNodeColumnDesc){
+ ExprNodeColumnDesc encd = (ExprNodeColumnDesc)exprNodeDesc;
+ String col = encd.getColumn();
+ if(subqueryCtx.getSelectColumnNames().contains(col)){
+ encd.setColumn(subqueryCtx.getAliasToInternal().get(col));
+ }
+ exprNodeDesc = encd;
+ }else if(exprNodeDesc instanceof ExprNodeGenericFuncDesc){
+ ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc)exprNodeDesc;
+ List<ExprNodeDesc> colExprs = engfd.getChildExprs();
+ for (ExprNodeDesc colExpr : colExprs) {
+ //continue until you find an instance of the ExprNodeColumnDesc
+ setFilterPredicateCol(colExpr);
+ }
+ }
+
+ }
+
+
+ public static NewQueryFilterSchemaProc getNewQueryFilterSchemaProc(){
+ return new NewQueryFilterSchemaProc();
+ }
+
+
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteParseContextGenerator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteParseContextGenerator.java
new file mode 100644
index 0000000..3d97a4a
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteParseContextGenerator.java
@@ -0,0 +1,100 @@
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.Context;
+import org.apache.hadoop.hive.ql.parse.ASTNode;
+import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.ParseDriver;
+import org.apache.hadoop.hive.ql.parse.ParseException;
+import org.apache.hadoop.hive.ql.parse.ParseUtils;
+import org.apache.hadoop.hive.ql.parse.QB;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzerFactory;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+/**
+ * RewriteParseContextGenerator is a class that offers methods to generate an operator tree
+ * for input queries. It is implemented along the lines of the analyzeInternal(..) method
+ * of {@link SemanticAnalyzer}, but it creates only the ParseContext for the input query command.
+ * It does not optimize or generate map-reduce tasks for the input query.
+ * This can be used when you need to create an operator tree for an internal query.
+ * For example, the group-by-using-index rewrite uses it to generate the operator tree of
+ * a subquery that scans over the index table rather than the original table.
+ *
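+ * A minimal sketch of its use (the command string and index table name are illustrative):
+ *   ParseContext subqueryPctx = RewriteParseContextGenerator.generateOperatorTree(conf,
+ *       "select key, size(`_offsets`) as CNT from tbl_idx_table");
+ *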
+ */
+public final class RewriteParseContextGenerator {
+ protected static Log LOG = LogFactory.getLog(RewriteParseContextGenerator.class.getName());
+
+ /**
+ * Parse the input {@link String} command, run semantic analysis on it and
+ * return the {@link ParseContext} of the operator tree generated for it
+ * @param conf
+ * @param command
+ * @return ParseContext of the operator tree generated for the command
+ */
+ public static ParseContext generateOperatorTree(HiveConf conf, String command){
+ Context ctx;
+ ParseContext subPCtx = null;
+ try {
+ ctx = new Context(conf);
+ ParseDriver pd = new ParseDriver();
+ ASTNode tree = pd.parse(command, ctx);
+ tree = ParseUtils.findRootNonNullToken(tree);
+
+ BaseSemanticAnalyzer sem = SemanticAnalyzerFactory.get(conf, tree);
+ doSemanticAnalysis(sem, tree, ctx);
+
+ subPCtx = ((SemanticAnalyzer) sem).getParseContext();
+ LOG.info("Sub-query Semantic Analysis Completed");
+ } catch (IOException e) {
+ LOG.error("IOException in generating the operator tree for the command: " + command, e);
+ } catch (ParseException e) {
+ LOG.error("ParseException in generating the operator tree for the command: " + command, e);
+ } catch (SemanticException e) {
+ LOG.error("SemanticException in generating the operator tree for the command: " + command, e);
+ }
+ return subPCtx;
+
+ }
+
+ /**
+ * For the input ASTNode tree, perform semantic analysis and check metadata, then
+ * generate an operator tree; the caller retrieves the resulting {@link ParseContext} from the analyzer
+ *
+ * @param sem
+ * @param ast
+ * @param ctx
+ * @throws SemanticException
+ */
+ private static void doSemanticAnalysis(BaseSemanticAnalyzer sem, ASTNode ast, Context ctx) throws SemanticException {
+
+ if(sem instanceof SemanticAnalyzer){
+ QB qb = new QB(null, null, false);
+ ASTNode child = ast;
+ ParseContext subPCtx = ((SemanticAnalyzer) sem).getParseContext();
+ subPCtx.setContext(ctx);
+ ((SemanticAnalyzer) sem).init(subPCtx);
+
+ LOG.info("Starting Sub-query Semantic Analysis");
+ ((SemanticAnalyzer) sem).doPhase1(child, qb, ((SemanticAnalyzer) sem).initPhase1Ctx());
+ LOG.info("Completed phase 1 of Sub-query Semantic Analysis");
+
+ ((SemanticAnalyzer) sem).getMetaData(qb);
+ LOG.info("Completed getting MetaData in Sub-query Semantic Analysis");
+
+ LOG.info("Sub-query Abstract syntax tree: " + ast.toStringTree());
+ ((SemanticAnalyzer) sem).genPlan(qb);
+
+ LOG.info("Sub-query Completed plan generation");
+ }
+ }
+
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyCtx.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyCtx.java
new file mode 100644
index 0000000..8662a41
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyCtx.java
@@ -0,0 +1,222 @@
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.PreOrderWalker;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.parse.ASTNode;
+import org.apache.hadoop.hive.ql.parse.HiveParser;
+import org.apache.hadoop.hive.ql.parse.OpParseContext;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+
+/**
+ * RewriteRemoveGroupbyCtx class stores the context for the {@link RewriteRemoveGroupbyProcFactory} processor factory methods
+ */
+public class RewriteRemoveGroupbyCtx implements NodeProcessorCtx {
+
+ private RewriteRemoveGroupbyCtx(ParseContext parseContext, Hive hiveDb, String indexTableName){
+ //private constructor; instances are obtained through getInstance(..)
+ this.parseContext = parseContext;
+ this.hiveDb = hiveDb;
+ this.indexName = indexTableName;
+ this.opc = parseContext.getOpParseCtx();
+ }
+
+ public static RewriteRemoveGroupbyCtx getInstance(ParseContext parseContext, Hive hiveDb, String indexTableName){
+ return new RewriteRemoveGroupbyCtx(parseContext, hiveDb, indexTableName);
+ }
+
+ //We need these two ArrayLists to reset the parent operator list and child operator list in the operator tree
+ // once we remove the operators that represent the group-by construct
+ private final List<Operator<? extends Serializable>> newParentList = new ArrayList<Operator<? extends Serializable>>();
+ private final List<Operator<? extends Serializable>> newChildrenList = new ArrayList<Operator<? extends Serializable>>();
+
+ //We need to remove the operators from OpParseContext to remove them from the operator tree
+ private LinkedHashMap<Operator<? extends Serializable>, OpParseContext> opc = new LinkedHashMap<Operator<? extends Serializable>, OpParseContext>();
+ private final Hive hiveDb;
+ private final ParseContext parseContext;
+
+ //We need the RewriteCanApplyCtx instance to retrieve the mapping from original table to index table in the
+ // getReplaceTableScanProc() method of the RewriteRemoveGroupbyProcFactory
+ //private RewriteCanApplyCtx canApplyCtx;
+ private final String indexName;
+
+ public List<Operator<? extends Serializable>> getNewParentList() {
+ return newParentList;
+ }
+
+ public List<Operator<? extends Serializable>> getNewChildrenList() {
+ return newChildrenList;
+ }
+
+ public LinkedHashMap<Operator<? extends Serializable>, OpParseContext> getOpc() {
+ return opc;
+ }
+
+ public ParseContext getParseContext() {
+ return parseContext;
+ }
+
+ public Hive getHiveDb() {
+ return hiveDb;
+ }
+
+ public String getIndexName() {
+ return indexName;
+ }
+
+ /**
+ * Given a root node of the parse tree, this function returns the "first" TOK_FUNCTION node
+ * that matches the input function name
+ *
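+ * For example, for the parse tree of a command like "select size(`_offsets`) from idx_tbl" (an illustrative
+ * index table name), getFuncNode(tree, "size") returns the TOK_FUNCTION node of the size(..) call.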
+ * @param root
+ * @param funcName
+ * @return the first TOK_FUNCTION node matching funcName, or null if none is found
+ */
+ ASTNode getFuncNode(ASTNode root, String funcName){
+ ASTNode func = null;
+ ArrayList<Node> cList = root.getChildren();
+ while(cList != null && cList.size() > 0){
+ for (Node node : cList) {
+ if(null != node){
+ ASTNode curr = (ASTNode)node;
+ if(curr.getType() == HiveParser.TOK_FUNCTION){
+ ArrayList<Node> funcChildren = curr.getChildren();
+ for (Node child : funcChildren) {
+ ASTNode funcChild = (ASTNode)child;
+ if(funcChild.getText().equals(funcName)){
+ func = curr;
+ cList = null;
+ break;
+ }
+ }
+ }else{
+ cList = curr.getChildren();
+ continue;
+ }
+ }
+ }
+ }
+ return func;
+ }
+
+
+ /**
+ * Given an input operator, this function returns the top TableScanOperator for the operator tree
+ * @param inputOp
+ * @return
+ */
+ Operator<? extends Serializable> getTopOperator(Operator<? extends Serializable> inputOp){
+ Operator<? extends Serializable> tsOp = null;
+ List<Operator<? extends Serializable>> parentList = inputOp.getParentOperators();
+ while(parentList != null && parentList.size() > 0){
+ for (Operator<? extends Serializable> op : parentList) {
+ if(op != null){
+ if(op instanceof TableScanOperator){
+ tsOp = (TableScanOperator) op;
+ parentList = null;
+ break;
+ }else{
+ parentList = op.getParentOperators();
+ continue;
+ }
+ }
+ }
+ }
+
+ return tsOp;
+ }
+
+
+ /**
+ * Walk the original operator tree with the {@link PreOrderWalker} using the rules below.
+ * Each of the rules invoke respective methods from the {@link RewriteRemoveGroupbyProcFactory}
+ * to remove the group-by constructs from the original query and replace the original
+ * {@link TableScanOperator} with the new index table scan operator.
+ *
+ * @param topOp
+ * @throws SemanticException
+ */
+ public void invokeRemoveGbyProc(Operator<? extends Serializable> topOp) throws SemanticException{
+ Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+
+ // replace scan operator containing original table with index table
+ opRules.put(new RuleRegExp("R1", "TS%"), RewriteRemoveGroupbyProcFactory.getReplaceTableScanProc());
+ //rule that replaces index key selection with size(_offsets) function in original query
+ opRules.put(new RuleRegExp("R2", "SEL%"), RewriteRemoveGroupbyProcFactory.getReplaceIdxKeyWithSizeFuncProc());
+ // remove group-by pattern from original operator tree
+ opRules.put(new RuleRegExp("R3", "GBY%RS%GBY%"), RewriteRemoveGroupbyProcFactory.getRemoveGroupByProc());
+
+ // The dispatcher fires the processor corresponding to the closest matching
+ // rule and passes the context along
+ Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this);
+ GraphWalker ogw = new PreOrderWalker(disp);
+
+ // Create a list of topop nodes
+ ArrayList<Node> topNodes = new ArrayList<Node>();
+ topNodes.add(topOp);
+ ogw.startWalking(topNodes, null);
+
+ }
+
+
+ /**
+ * Walk the original operator tree with the {@link PreOrderWalker} using the rules below.
+ * Each of the rules invoke respective methods from the {@link RewriteRemoveGroupbyProcFactory}
+ * to replace the original {@link TableScanOperator} with the new index table scan operator.
+ *
+ * @param topOp
+ * @throws SemanticException
+ */
+ public void invokeReplaceTableScanProc(Operator<? extends Serializable> topOp) throws SemanticException{
+ Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+
+ // replace scan operator containing original table with index table
+ opRules.put(new RuleRegExp("R1", "TS%"), RewriteRemoveGroupbyProcFactory.getReplaceTableScanProc());
+
+ // The dispatcher fires the processor corresponding to the closest matching
+ // rule and passes the context along
+ Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this);
+ GraphWalker ogw = new PreOrderWalker(disp);
+
+ // Create a list of topop nodes
+ ArrayList<Node> topNodes = new ArrayList<Node>();
+ topNodes.add(topOp);
+ ogw.startWalking(topNodes, null);
+
+ }
+
+ /**
+ * Default procedure for {@link DefaultRuleDispatcher}
+ * @return
+ */
+ private NodeProcessor getDefaultProc() {
+ return new NodeProcessor() {
+ @Override
+ public Object process(Node nd, Stack<Node> stack,
+ NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
+ return null;
+ }
+ };
+ }
+
+
+
+
+}
+
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyProcFactory.java
new file mode 100644
index 0000000..1a04855
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteRemoveGroupbyProcFactory.java
@@ -0,0 +1,339 @@
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.Context;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.parse.ASTNode;
+import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
+import org.apache.hadoop.hive.ql.parse.OpParseContext;
+import org.apache.hadoop.hive.ql.parse.ParseDriver;
+import org.apache.hadoop.hive.ql.parse.ParseException;
+import org.apache.hadoop.hive.ql.parse.ParseUtils;
+import org.apache.hadoop.hive.ql.parse.RowResolver;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzerFactory;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+
+/**
+ * Factory of processors used by {@link RewriteGBUsingIndex} (see invokeRemoveGbyProc(..) method)
+ * Each of the processors is invoked according to a rule and serves to remove the
+ * group-by construct from the original operator tree
+ *
+ */
+public final class RewriteRemoveGroupbyProcFactory {
+ protected final static Log LOG = LogFactory.getLog(RewriteRemoveGroupbyProcFactory.class.getName());
+ private static RewriteRemoveGroupbyCtx removeGbyCtx = null;
+
+ private RewriteRemoveGroupbyProcFactory() {
+ //this prevents the class from getting instantiated
+ }
+
+ /**
+ * This processor removes the SelectOperator whose child is a GroupByOperator from the operator tree (OpParseContext).
+ * When we remove the group-by construct from the query, we do not need this SelectOperator which worked initially as an
+ * interim operator to pass arguments from the parent TableScanOperator to the child GroupByOperator (Remember that the genPlan(..)
+ * method creates the operators bottom-up FROM-WHERE-GROUPBY-ORDER-BY-SELECT etc)
+ *
+ * Since we need to remove the group-by construct (comprising the GBY-RS-GBY operators and the interim SEL operator), the processor sets the
+ * appropriate parent-child links.
+ *
+ * The processor also constructs a ExprNodeDesc instance for the size(_offsets) function and replaces the index key columns
+ * with this function descriptor. It also sets the rowSchema, colList and colExprMap data structures correctly for this SelectOperator
+ * to accommodate the new replacement and removal of group-by construct
+ *
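+ * For illustration only (table and index names are hypothetical): after this rewrite a query such as
+ *   SELECT key, count(key) FROM tbl GROUP BY key
+ * can be answered as
+ *   SELECT key, size(`_offsets`) FROM idx_tbl
+ * with no group-by construct left in the plan.
+ *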
+ */
+ private static class ReplaceIdxKeyWithSizeFunc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ SelectOperator operator = (SelectOperator)nd;
+ removeGbyCtx = (RewriteRemoveGroupbyCtx)ctx;
+
+ //as of now, we have hard-coded the positions as get(0) etc. because whenever a group-by construct appears in the operator tree,
+ //it comes in the SEL-GBY-RS-SEL combination. This lets us presume that the parent or child operator will always be
+ // at the 0th position in the DAG operator tree
+ List<Operator<? extends Serializable>> childrenList = operator.getChildOperators();
+ Operator<? extends Serializable> child = childrenList.get(0);
+ Operator<? extends Serializable> parent = operator.getParentOperators().get(0);
+
+ if(child instanceof GroupByOperator){
+ //this is the interim SEL operator for the group-by construct, we do not need this in the re-written operator tree
+ removeGbyCtx.getNewParentList().addAll(operator.getParentOperators());
+ removeGbyCtx.getOpc().remove(operator);
+ }else if(parent instanceof GroupByOperator){
+
+ // set the child operator list of interim SEL's parent operator to be the child operator list of the GroupByOperator
+ removeGbyCtx.getNewParentList().get(0).setChildOperators(removeGbyCtx.getNewChildrenList());
+ // set the parent operator list for the SelectOperator (whose parent operator is GroupByOperator)
+ //to be the parent list of interim SEL operator
+ removeGbyCtx.getNewChildrenList().get(0).setParentOperators(removeGbyCtx.getNewParentList());
+
+ //This code parses the string command and constructs a ASTNode parse tree
+ //we need this to construct the ExprNodeDesc for the size(_offsets) function
+ HiveConf conf = removeGbyCtx.getParseContext().getConf();
+ Context context = null;
+ ASTNode tree = null;
+ BaseSemanticAnalyzer sem = null;
+ String newSelCommand = "select size(`_offsets`) from " + removeGbyCtx.getIndexName();
+ try {
+ context = new Context(conf);
+ ParseDriver pd = new ParseDriver();
+ tree = pd.parse(newSelCommand, context);
+ tree = ParseUtils.findRootNonNullToken(tree);
+ sem = SemanticAnalyzerFactory.get(conf, tree);
+
+ } catch (ParseException e) {
+ LOG.info("ParseException in ReplaceIdxKeyWithSizeFunc");
+ e.printStackTrace();
+ } catch (SemanticException e) {
+ LOG.info("SemanticException in ReplaceIdxKeyWithSizeFunc");
+ e.printStackTrace();
+ } catch (IOException e) {
+ LOG.info("IOException in ReplaceIdxKeyWithSizeFunc");
+ e.printStackTrace();
+ }
+
+ //We retrieve the ASTNode function token from the root tree
+ ASTNode funcNode = removeGbyCtx.getFuncNode(tree, "size");
+
+ //We need the rowResolver of the parent TableScanOperator to fix the rowSchema, colList, colExprMap of the SelectOperator
+ //and also to construct the ExprNodeDesc to replace the index key columns with size(_offsets) GenericUDF
+ LinkedHashMap<Operator<? extends Serializable>, OpParseContext> opCtxMap =
+ removeGbyCtx.getParseContext().getOpParseCtx();
+ Operator<? extends Serializable> tsOp = removeGbyCtx.getTopOperator(operator);
+ OpParseContext tsCtx = opCtxMap.get(tsOp);
+ ExprNodeDesc exprNode = ((SemanticAnalyzer) sem).genExprNodeDesc(funcNode, tsCtx.getRowResolver());
+
+ //We need the name of the GenericUDF function to correct the rowSchema
+ String funcName = "";
+
+ if(exprNode instanceof ExprNodeGenericFuncDesc){
+ List<ExprNodeDesc> exprList = ((ExprNodeGenericFuncDesc) exprNode).getChildExprs();
+ for (ExprNodeDesc exprNodeDesc : exprList) {
+ if(exprNodeDesc instanceof ExprNodeColumnDesc){
+ funcName = ((ExprNodeColumnDesc) exprNodeDesc).getColumn();
+ }
+ }
+ }
+
+ SelectDesc selDesc = (SelectDesc) operator.getConf();
+ //Since we have removed the interim SEL operator when we removed the group-by construct, we need to get rid
+ //of the internal names in the colList and colExprMap of this SelectOperator
+ //internalToAlias map gives us this mapping to correct these data structures
+ HashMap<String, String> internalToAlias = new LinkedHashMap<String, String>();
+
+ //Set the new RowSchema and populate the internalToAlias map
+ RowSchema rs = operator.getSchema();
+ ArrayList<ColumnInfo> newRS = new ArrayList<ColumnInfo>();
+ ArrayList<ColumnInfo> sign = rs.getSignature();
+ for (ColumnInfo columnInfo : sign) {
+ String alias = columnInfo.getAlias();
+ String internalName = columnInfo.getInternalName();
+ internalToAlias.put(internalName, alias);
+ //the function name always has alias starting with _c (for eg. _c1 etc)
+ //We need to set the new alias (_offsets) for the initial "_c1" in rowSchema
+ if(alias != null && alias.startsWith("_c")){
+ columnInfo.setAlias(funcName);
+ }
+ newRS.add(columnInfo);
+ }
+ operator.getSchema().setSignature(newRS);
+
+ //Set the colList of this SelectOperator
+ ArrayList<ExprNodeDesc> colList = selDesc.getColList();
+ int i = 0;
+ for (; i< colList.size(); i++) {
+ ExprNodeDesc exprNodeDesc = colList.get(i);
+ if(exprNodeDesc instanceof ExprNodeColumnDesc){
+ String internal = ((ExprNodeColumnDesc)exprNodeDesc).getColumn();
+ //get rid of the internal column names like _col0, _col1 and replace them with their actual names i.e. alias
+ if(internalToAlias.get(internal) != null){
+ ((ExprNodeColumnDesc) exprNodeDesc).setColumn(internalToAlias.get(internal));
+ }
+ //however, if the alias itself is the internal name of the function argument, say _c1, we need to replace the
+ //ExprNodeColumnDesc instance with the ExprNodeGenericFuncDesc (i.e. exprNode here)
+ //this replaces the count(literal) or count(index_key) function with size(_offsets)
+ if(((ExprNodeColumnDesc) exprNodeDesc).getColumn().startsWith("_c")){
+ colList.set(i, exprNode);
+ }
+ }
+ }
+
+ selDesc.setColList(colList);
+
+ //Set the new colExprMap for this SelectOperator
+ Map<String, ExprNodeDesc> origColExprMap = operator.getColumnExprMap();
+ Map<String, ExprNodeDesc> newColExprMap = new LinkedHashMap<String, ExprNodeDesc>();
+ Set<String> internalNamesList = origColExprMap.keySet();
+ for (String internal : internalNamesList) {
+ ExprNodeDesc end = origColExprMap.get(internal).clone();
+ if(end instanceof ExprNodeColumnDesc){
+ //get rid of the internal column names like _col0, _col1 and replace them with their actual names i.e. alias
+ if(internalToAlias.get(internal) != null){
+ ((ExprNodeColumnDesc) end).setColumn(internalToAlias.get(internal));
+ }
+ //this replaces the count(literal) or count(index_key) function with size(_offsets)
+ if(((ExprNodeColumnDesc) end).getColumn().startsWith("_c")){
+ newColExprMap.put(internal, exprNode);
+ }else{
+ newColExprMap.put(internal, end);
+ }
+ }else{
+ newColExprMap.put(internal, end);
+ }
+ }
+ operator.setColumnExprMap(newColExprMap);
+ }
+ return null;
+ }
+ }
+
+ public static ReplaceIdxKeyWithSizeFunc getReplaceIdxKeyWithSizeFuncProc(){
+ return new ReplaceIdxKeyWithSizeFunc();
+ }
+
+
+ /**
+ * This processor replaces the original TableScanOperator with the new TableScanOperator and metadata that scans over the
+ * index table rather than scanning over the original table.
+ *
+ */
+ private static class ReplaceTableScanOpProc implements NodeProcessor {
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ TableScanOperator scanOperator = (TableScanOperator)nd;
+ removeGbyCtx = (RewriteRemoveGroupbyCtx)ctx;
+
+ HashMap<TableScanOperator, Table> topToTable =
+ removeGbyCtx.getParseContext().getTopToTable();
+
+ //Check if we have a valid index on the original base table for the replacement
+/* String baseTableName = topToTable.get(scanOperator).getTableName();
+ if( removeGbyCtx.getCanApplyCtx().findBaseTable(baseTableName) == null ) {
+ LOG.debug("No mapping found for original table and index table name");
+ }
+*/
+ //construct a new descriptor for the index table scan
+ TableScanDesc indexTableScanDesc = new TableScanDesc();
+ indexTableScanDesc.setGatherStats(false);
+
+ //String tableName = removeGbyCtx.getCanApplyCtx().findBaseTable(baseTableName);
+ String tableName = removeGbyCtx.getIndexName();
+
+ tableSpec ts = new tableSpec(removeGbyCtx.getHiveDb(),
+ removeGbyCtx.getParseContext().getConf(),
+ tableName
+ );
+ String k = tableName + Path.SEPARATOR;
+ indexTableScanDesc.setStatsAggPrefix(k);
+ scanOperator.setConf(indexTableScanDesc);
+
+ //remove original TableScanOperator
+ topToTable.clear();
+ removeGbyCtx.getParseContext().getTopOps().clear();
+
+ //Scan operator now points to other table
+ scanOperator.setAlias(tableName);
+ topToTable.put(scanOperator, ts.tableHandle);
+ removeGbyCtx.getParseContext().setTopToTable(topToTable);
+
+ OpParseContext operatorContext =
+ removeGbyCtx.getParseContext().getOpParseCtx().get(scanOperator);
+ RowResolver rr = new RowResolver();
+ removeGbyCtx.getParseContext().getOpParseCtx().remove(scanOperator);
+
+
+ //Construct the new RowResolver for the new TableScanOperator
+ try {
+ StructObjectInspector rowObjectInspector = (StructObjectInspector) ts.tableHandle.getDeserializer().getObjectInspector();
+ List<? extends StructField> fields = rowObjectInspector
+ .getAllStructFieldRefs();
+ for (int i = 0; i < fields.size(); i++) {
+ rr.put(tableName, fields.get(i).getFieldName(), new ColumnInfo(fields
+ .get(i).getFieldName(), TypeInfoUtils
+ .getTypeInfoFromObjectInspector(fields.get(i)
+ .getFieldObjectInspector()), tableName, false));
+ }
+ } catch (SerDeException e) {
+ throw new RuntimeException(e);
+ }
+ //Set row resolver for new table
+ operatorContext.setRowResolver(rr);
+
+ //Put the new TableScanOperator in the OpParseContext and topOps maps of the original ParseContext
+ removeGbyCtx.getParseContext().getOpParseCtx().put(scanOperator, operatorContext);
+ removeGbyCtx.getParseContext().getTopOps().put(tableName, scanOperator);
+ return null;
+ }
+ }
+
+ public static ReplaceTableScanOpProc getReplaceTableScanProc(){
+ return new ReplaceTableScanOpProc();
+ }
+
+ /**
+ * This processor removes the GroupBy operators and the interim ReduceSinkOperator from the OpParseContext
+ *
+ */
+ private static class RemoveGBYProc implements NodeProcessor {
+ public Object process(Node nd, Stack stack, NodeProcessorCtx ctx,
+ Object... nodeOutputs) throws SemanticException {
+ GroupByOperator operator = (GroupByOperator)nd;
+ removeGbyCtx = (RewriteRemoveGroupbyCtx)ctx;
+ //Walking the operator tree with the rule 'GBY-RS-GBY' hands us the GroupByOperator that is not in the 'groupOpToInputTables'
+ //map of the ParseContext; hence the check.
+ if(!removeGbyCtx.getParseContext().getGroupOpToInputTables().containsKey(operator)){
+ removeGbyCtx.getNewChildrenList().addAll(operator.getChildOperators());
+
+ ReduceSinkOperator rsOp = (ReduceSinkOperator) operator.getParentOperators().get(0);
+ removeGbyCtx.getOpc().remove(rsOp);
+
+ GroupByOperator gbyOp = (GroupByOperator) rsOp.getParentOperators().get(0);
+ //we need to remove this GBY operator from the groupOpToInputTables map from ParseContext as well
+ removeGbyCtx.getParseContext().getGroupOpToInputTables().remove(gbyOp);
+ removeGbyCtx.getOpc().remove(gbyOp);
+
+ }
+
+ return null;
+ }
+ }
+
+ public static RemoveGBYProc getRemoveGroupByProc(){
+ return new RemoveGBYProc();
+ }
+
+
+}
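For orientation, a minimal sketch of how the factory methods above would typically be bound to
operator-tree rules and dispatched through Hive's rule-based walker (org.apache.hadoop.hive.ql.lib).
The helper name walkAndRewrite and the rule pattern strings are illustrative assumptions, not the
exact wiring used by this patch:

    import java.util.ArrayList;
    import java.util.LinkedHashMap;
    import java.util.Map;

    import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
    import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
    import org.apache.hadoop.hive.ql.lib.Dispatcher;
    import org.apache.hadoop.hive.ql.lib.GraphWalker;
    import org.apache.hadoop.hive.ql.lib.Node;
    import org.apache.hadoop.hive.ql.lib.NodeProcessor;
    import org.apache.hadoop.hive.ql.lib.Rule;
    import org.apache.hadoop.hive.ql.lib.RuleRegExp;
    import org.apache.hadoop.hive.ql.parse.SemanticException;

    // Hypothetical helper living next to the factory methods above: fire the rewrite
    // processors while walking the operator tree from its TableScanOperator roots.
    private static void walkAndRewrite(RewriteRemoveGroupbyCtx removeGbyCtx) throws SemanticException {
      Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
      // Pattern strings are assumptions: match the TableScan root, the GBY-RS-GBY chain,
      // and the Select whose column expression map needs the size(`_offsets`) fix-up.
      opRules.put(new RuleRegExp("R1", "TS%"), getReplaceTableScanProc());
      opRules.put(new RuleRegExp("R2", "GBY%RS%GBY%"), getRemoveGroupByProc());
      opRules.put(new RuleRegExp("R3", "SEL%"), getReplaceIdxKeyWithSizeFuncProc());

      // No default processor: nodes that match no rule are left untouched.
      Dispatcher disp = new DefaultRuleDispatcher(null, opRules, removeGbyCtx);
      GraphWalker walker = new DefaultGraphWalker(disp);

      ArrayList<Node> topNodes = new ArrayList<Node>();
      topNodes.addAll(removeGbyCtx.getParseContext().getTopOps().values());
      walker.startWalking(topNodes, null);
    }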
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java
index d8442b2..fd012a5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java
@@ -71,7 +71,7 @@ public abstract class BaseSemanticAnalyzer {
protected Context ctx;
protected HashMap idToTableNameMap;
-
+
public static int HIVE_COLUMN_ORDER_ASC = 1;
public static int HIVE_COLUMN_ORDER_DESC = 0;
@@ -583,6 +583,22 @@ public abstract class BaseSemanticAnalyzer {
public static enum SpecType {TABLE_ONLY, STATIC_PARTITION, DYNAMIC_PARTITION};
public SpecType specType;
+ public tableSpec(Hive db, HiveConf conf, String tableName) throws SemanticException {
+ this.tableName = tableName;
+
+ try {
+ this.tableHandle = db.getTable(tableName);
+ } catch (HiveException e) {
+ //XTODO: Use a more specific ErrorMsg than GENERIC_ERROR here
+ throw new SemanticException(ErrorMsg.GENERIC_ERROR.getMsg(tableName), e);
+ }
+ this.specType = SpecType.TABLE_ONLY;
+
+ }
+ private Table getTable(String tableName2) {
+ // TODO Auto-generated method stub
+ return null;
+ }
public tableSpec(Hive db, HiveConf conf, ASTNode ast)
throws SemanticException {
@@ -719,7 +735,7 @@ public abstract class BaseSemanticAnalyzer {
}
return partSpec;
}
-
+
public Hive getDb() {
return db;
}
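A minimal usage sketch for the new tableSpec(Hive, HiveConf, String) constructor added above,
assuming a session in which the index table exists; the helper name resolveIndexTable and its
error handling are illustrative, not part of the patch:

    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.hive.ql.metadata.Hive;
    import org.apache.hadoop.hive.ql.metadata.Table;
    import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer;
    import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec;

    // Hypothetical helper: resolve an index table by name, with no partition spec,
    // the same way the table-scan replacement processor above does.
    static Table resolveIndexTable(String indexTableName) throws Exception {
      HiveConf conf = new HiveConf(BaseSemanticAnalyzer.class);
      Hive db = Hive.get(conf);
      // TABLE_ONLY spec: tableHandle is fetched eagerly and a SemanticException is
      // raised if the table cannot be found.
      tableSpec ts = new tableSpec(db, conf, indexTableName);
      return ts.tableHandle;
    }

For example, resolveIndexTable("default__tbl_tbl_key_idx__") resolves the compact index table
created by the new ql_rewrite_gbtoidx.q test below.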
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index 15e7a13..cc0ee20 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -27,9 +27,9 @@ import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
-import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
+import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@@ -91,7 +91,6 @@ import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.optimizer.GenMRFileSink1;
import org.apache.hadoop.hive.ql.optimizer.GenMROperator;
import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext;
-import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx;
import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink1;
import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink2;
import org.apache.hadoop.hive.ql.optimizer.GenMRRedSink3;
@@ -101,6 +100,7 @@ import org.apache.hadoop.hive.ql.optimizer.GenMRUnion1;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.optimizer.MapJoinFactory;
import org.apache.hadoop.hive.ql.optimizer.Optimizer;
+import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx;
import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalContext;
import org.apache.hadoop.hive.ql.optimizer.physical.PhysicalOptimizer;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
@@ -121,7 +121,6 @@ import org.apache.hadoop.hive.ql.plan.ExtractDesc;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.FilterDesc;
-import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;
import org.apache.hadoop.hive.ql.plan.ForwardDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.HiveOperation;
@@ -144,12 +143,13 @@ import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.plan.UDTFDesc;
import org.apache.hadoop.hive.ql.plan.UnionDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.session.SessionState.ResourceType;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe;
@@ -157,9 +157,9 @@ import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
@@ -7391,4 +7391,4 @@ public class SemanticAnalyzer extends BaseSemanticAnalyzer {
return conf.getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
}
-}
+}
\ No newline at end of file
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java b/ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java
index 6a5eec3..a4dc28d 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/QTestUtil.java
@@ -330,7 +330,17 @@ public class QTestUtil {
db.setCurrentDatabase(dbName);
for (String tblName : db.getAllTables()) {
if (!DEFAULT_DATABASE_NAME.equals(dbName) || !srcTables.contains(tblName)) {
- db.dropTable(dbName, tblName);
+ Table table = db.getTable(dbName, tblName, false);
+ if (MetaStoreUtils.isIndexTable(table.getTTable())) {
+ // Skip index tables here.
+ // XTODO: Assumptions to verify:
+ // - Dropping a table automatically drops the indexes on that table too.
+ // - No other case results in dangling indexes, i.e. indexes that are
+ //   left behind after the original (base) table no longer exists.
+ }
+ else {
+ db.dropTable(dbName, tblName);
+ }
}
}
if (!DEFAULT_DATABASE_NAME.equals(dbName)) {
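A sketch of how the cleanup assumption noted in the comment above could be checked at the end of
the loop; the warning text and its placement are illustrative, not part of the patch:

    // Print any index table that survives the cleanup pass. Under the assumption above,
    // the only survivors should be indexes whose base table was intentionally kept.
    for (String tblName : db.getAllTables()) {
      Table t = db.getTable(dbName, tblName, false);
      if (t != null && MetaStoreUtils.isIndexTable(t.getTTable())) {
        System.err.println("Index table still present after cleanup: " + tblName);
      }
    }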
diff --git a/ql/src/test/queries/clientnegative/fatal.q b/ql/src/test/queries/clientnegative/fatal.q
new file mode 100644
index 0000000..367e0fc
--- /dev/null
+++ b/ql/src/test/queries/clientnegative/fatal.q
@@ -0,0 +1,4 @@
+set hive.mapjoin.maxsize=1;
+set hive.task.progress=true;
+
+select /*+ mapjoin(b) */ * from src a join src b on (a.key=b.key);
diff --git a/ql/src/test/queries/clientpositive/ql_rewrite_gbtoidx.q b/ql/src/test/queries/clientpositive/ql_rewrite_gbtoidx.q
new file mode 100644
index 0000000..cccd1ec
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/ql_rewrite_gbtoidx.q
@@ -0,0 +1,162 @@
+
+DROP TABLE lineitem;
+CREATE TABLE lineitem (L_ORDERKEY INT,
+ L_PARTKEY INT,
+ L_SUPPKEY INT,
+ L_LINENUMBER INT,
+ L_QUANTITY DOUBLE,
+ L_EXTENDEDPRICE DOUBLE,
+ L_DISCOUNT DOUBLE,
+ L_TAX DOUBLE,
+ L_RETURNFLAG STRING,
+ L_LINESTATUS STRING,
+ l_shipdate STRING,
+ L_COMMITDATE STRING,
+ L_RECEIPTDATE STRING,
+ L_SHIPINSTRUCT STRING,
+ L_SHIPMODE STRING,
+ L_COMMENT STRING)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|';
+
+CREATE INDEX lineitem_lshipdate_idx ON TABLE lineitem(l_shipdate) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD;
+ALTER INDEX lineitem_lshipdate_idx ON lineitem REBUILD;
+
+set hive.optimize.gbyusingindex=true;
+
+explain select l_shipdate,
+ count(1)
+from
+lineitem
+group by l_shipdate;
+
+
+explain select lastyear.month,
+ thisyear.month,
+ (thisyear.monthly_shipments - lastyear.monthly_shipments) /
+lastyear.monthly_shipments as monthly_shipments_delta
+ from (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1997
+ group by year(l_shipdate), month(l_shipdate)
+ ) lastyear join
+ (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1998
+ group by year(l_shipdate), month(l_shipdate)
+ ) thisyear
+ on lastyear.month = thisyear.month;
+
+
+
+explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+from lineitem
+group by year(l_shipdate), month(l_shipdate);
+
+
+
+explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ sum(sz)
+from (
+select l_shipdate, size(`_offsets`) as sz
+from default__lineitem_lineitem_lshipdate_idx__
+) t
+group by year(l_shipdate), month(l_shipdate);
+
+
+explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+from lineitem
+group by year(l_shipdate), month(l_shipdate);
+
+
+
+
+
+
+
+
+
+
+
+explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ sum(sz)
+from (
+select l_shipdate, size(`_offsets`) as sz
+from default__lineitem_lineitem_lshipdate_idx__
+) t
+group by year(l_shipdate), month(l_shipdate);
+
+
+explain select year(L_SHIPDATE), month(L_SHIPDATE) as month_bkt, COUNT(1)
+ from lineitem
+group by year(L_SHIPDATE), month(L_SHIPDATE);
+
+
+explain select lastyear.month,
+ thisyear.month,
+ (thisyear.monthly_shipments - lastyear.monthly_shipments) /
+lastyear.monthly_shipments as monthly_shipments_delta
+ from (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1997
+ group by year(l_shipdate), month(l_shipdate)
+ ) lastyear join
+ (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1998
+ group by year(l_shipdate), month(l_shipdate)
+ ) thisyear
+ on lastyear.month = thisyear.month
+ and lastyear.year = thisyear.year;
+
+
+DROP TABLE tbl;
+CREATE TABLE tbl(key int, value int);
+CREATE INDEX tbl_key_idx ON TABLE tbl(key) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD;
+ALTER INDEX tbl_key_idx ON tbl REBUILD;
+set hive.optimize.gbyusingindex=true;
+EXPLAIN select key, count(key) from tbl where key = 1 group by key;
+EXPLAIN SELECT DISTINCT key FROM tbl;
+EXPLAIN select count(1) from tbl;
+EXPLAIN select key, count(key) from tbl group by key;
+EXPLAIN select count(key) from tbl;
+EXPLAIN SELECT DISTINCT key FROM tbl;
+EXPLAIN SELECT key FROM tbl GROUP BY key;
+
+EXPLAIN SELECT DISTINCT key FROM tbl;
+EXPLAIN SELECT DISTINCT key, value FROM tbl;
+
+EXPLAIN SELECT key FROM tbl GROUP BY key;
+EXPLAIN SELECT key FROM tbl GROUP BY value, key;
+EXPLAIN SELECT key, value FROM tbl GROUP BY value, key;
+
+EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2;
+EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 AND key = 3;
+EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = key;
+
+
+EXPLAIN SELECT key FROM tbl WHERE key = 3 GROUP BY key;
+EXPLAIN SELECT key, value FROM tbl WHERE value = 1 GROUP BY key, value;
+
+EXPLAIN SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2;
+
+EXPLAIN SELECT key FROM tbl WHERE value = 2 GROUP BY key;
+EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl WHERE value = key;
+EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl;
+EXPLAIN SELECT key FROM tbl GROUP BY key, substr(key,2,3);
+
+DROP TABLE tbl;
diff --git a/ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out b/ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out
new file mode 100644
index 0000000..0efd042
--- /dev/null
+++ b/ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out
@@ -0,0 +1,2743 @@
+PREHOOK: query: DROP TABLE lineitem
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE lineitem
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE TABLE lineitem (L_ORDERKEY INT,
+ L_PARTKEY INT,
+ L_SUPPKEY INT,
+ L_LINENUMBER INT,
+ L_QUANTITY DOUBLE,
+ L_EXTENDEDPRICE DOUBLE,
+ L_DISCOUNT DOUBLE,
+ L_TAX DOUBLE,
+ L_RETURNFLAG STRING,
+ L_LINESTATUS STRING,
+ l_shipdate STRING,
+ L_COMMITDATE STRING,
+ L_RECEIPTDATE STRING,
+ L_SHIPINSTRUCT STRING,
+ L_SHIPMODE STRING,
+ L_COMMENT STRING)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE lineitem (L_ORDERKEY INT,
+ L_PARTKEY INT,
+ L_SUPPKEY INT,
+ L_LINENUMBER INT,
+ L_QUANTITY DOUBLE,
+ L_EXTENDEDPRICE DOUBLE,
+ L_DISCOUNT DOUBLE,
+ L_TAX DOUBLE,
+ L_RETURNFLAG STRING,
+ L_LINESTATUS STRING,
+ l_shipdate STRING,
+ L_COMMITDATE STRING,
+ L_RECEIPTDATE STRING,
+ L_SHIPINSTRUCT STRING,
+ L_SHIPMODE STRING,
+ L_COMMENT STRING)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '|'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@lineitem
+PREHOOK: query: CREATE INDEX lineitem_lshipdate_idx ON TABLE lineitem(l_shipdate) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD
+PREHOOK: type: CREATEINDEX
+POSTHOOK: query: CREATE INDEX lineitem_lshipdate_idx ON TABLE lineitem(l_shipdate) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD
+POSTHOOK: type: CREATEINDEX
+PREHOOK: query: ALTER INDEX lineitem_lshipdate_idx ON lineitem REBUILD
+PREHOOK: type: QUERY
+PREHOOK: Input: default@lineitem
+PREHOOK: Output: default@default__lineitem_lineitem_lshipdate_idx__
+POSTHOOK: query: ALTER INDEX lineitem_lshipdate_idx ON lineitem REBUILD
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@lineitem
+POSTHOOK: Output: default@default__lineitem_lineitem_lshipdate_idx__
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+PREHOOK: query: explain select l_shipdate,
+ count(1)
+from
+lineitem
+group by l_shipdate
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select l_shipdate,
+ count(1)
+from
+lineitem
+group by l_shipdate
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL l_shipdate)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL l_shipdate))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__lineitem_lineitem_lshipdate_idx__
+ TableScan
+ Select Operator
+ expressions:
+ expr: l_shipdate
+ type: string
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: explain select lastyear.month,
+ thisyear.month,
+ (thisyear.monthly_shipments - lastyear.monthly_shipments) /
+lastyear.monthly_shipments as monthly_shipments_delta
+ from (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1997
+ group by year(l_shipdate), month(l_shipdate)
+ ) lastyear join
+ (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1998
+ group by year(l_shipdate), month(l_shipdate)
+ ) thisyear
+ on lastyear.month = thisyear.month
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select lastyear.month,
+ thisyear.month,
+ (thisyear.monthly_shipments - lastyear.monthly_shipments) /
+lastyear.monthly_shipments as monthly_shipments_delta
+ from (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1997
+ group by year(l_shipdate), month(l_shipdate)
+ ) lastyear join
+ (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1998
+ group by year(l_shipdate), month(l_shipdate)
+ ) thisyear
+ on lastyear.month = thisyear.month
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1997)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) lastyear) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1998)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) thisyear) (= (. (TOK_TABLE_OR_COL lastyear) month) (. (TOK_TABLE_OR_COL thisyear) month)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL lastyear) month)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL thisyear) month)) (TOK_SELEXPR (/ (- (. (TOK_TABLE_OR_COL thisyear) monthly_shipments) (. (TOK_TABLE_OR_COL lastyear) monthly_shipments)) (. (TOK_TABLE_OR_COL lastyear) monthly_shipments)) monthly_shipments_delta))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-3
+ Stage-3 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ lastyear:default__lineitem_lineitem_lshipdate_idx__
+ TableScan
+ alias: default__lineitem_lineitem_lshipdate_idx__
+ Filter Operator
+ predicate:
+ expr: (year(l_shipdate) = 1997)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: l_shipdate
+ type: string
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ Filter Operator
+ predicate:
+ expr: (year(_col0) = 1997)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: string
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: sum(_col1)
+ bucketGroup: false
+ keys:
+ expr: year(_col0)
+ type: int
+ expr: month(_col0)
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col1
+ type: int
+ tag: 0
+ value expressions:
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ $INTNAME1
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col1
+ type: int
+ tag: 1
+ value expressions:
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col1} {VALUE._col2}
+ 1 {VALUE._col1} {VALUE._col2}
+ handleSkewJoin: false
+ outputColumnNames: _col1, _col2, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col4
+ type: int
+ expr: ((_col5 - _col2) / _col2)
+ type: double
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ thisyear:default__lineitem_lineitem_lshipdate_idx__
+ TableScan
+ alias: default__lineitem_lineitem_lshipdate_idx__
+ Filter Operator
+ predicate:
+ expr: (year(l_shipdate) = 1998)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: l_shipdate
+ type: string
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ Filter Operator
+ predicate:
+ expr: (year(_col0) = 1998)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: string
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: sum(_col1)
+ bucketGroup: false
+ keys:
+ expr: year(_col0)
+ type: int
+ expr: month(_col0)
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+from lineitem
+group by year(l_shipdate), month(l_shipdate)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+from lineitem
+group by year(l_shipdate), month(l_shipdate)
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__lineitem_lineitem_lshipdate_idx__
+ TableScan
+ alias: default__lineitem_lineitem_lshipdate_idx__
+ Select Operator
+ expressions:
+ expr: l_shipdate
+ type: string
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: string
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: sum(_col1)
+ bucketGroup: false
+ keys:
+ expr: year(_col0)
+ type: int
+ expr: month(_col0)
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ sum(sz)
+from (
+select l_shipdate, size(`_offsets`) as sz
+from default__lineitem_lineitem_lshipdate_idx__
+) t
+group by year(l_shipdate), month(l_shipdate)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ sum(sz)
+from (
+select l_shipdate, size(`_offsets`) as sz
+from default__lineitem_lineitem_lshipdate_idx__
+) t
+group by year(l_shipdate), month(l_shipdate)
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF default__lineitem_lineitem_lshipdate_idx__)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL l_shipdate)) (TOK_SELEXPR (TOK_FUNCTION size (TOK_TABLE_OR_COL `_offsets`)) sz)))) t)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL sz)))) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t:default__lineitem_lineitem_lshipdate_idx__
+ TableScan
+ alias: default__lineitem_lineitem_lshipdate_idx__
+ Select Operator
+ expressions:
+ expr: l_shipdate
+ type: string
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: int
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: sum(_col1)
+ bucketGroup: false
+ keys:
+ expr: year(_col0)
+ type: int
+ expr: month(_col0)
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+from lineitem
+group by year(l_shipdate), month(l_shipdate)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+from lineitem
+group by year(l_shipdate), month(l_shipdate)
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__lineitem_lineitem_lshipdate_idx__
+ TableScan
+ alias: default__lineitem_lineitem_lshipdate_idx__
+ Select Operator
+ expressions:
+ expr: l_shipdate
+ type: string
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: string
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: sum(_col1)
+ bucketGroup: false
+ keys:
+ expr: year(_col0)
+ type: int
+ expr: month(_col0)
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ sum(sz)
+from (
+select l_shipdate, size(`_offsets`) as sz
+from default__lineitem_lineitem_lshipdate_idx__
+) t
+group by year(l_shipdate), month(l_shipdate)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ sum(sz)
+from (
+select l_shipdate, size(`_offsets`) as sz
+from default__lineitem_lineitem_lshipdate_idx__
+) t
+group by year(l_shipdate), month(l_shipdate)
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF default__lineitem_lineitem_lshipdate_idx__)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL l_shipdate)) (TOK_SELEXPR (TOK_FUNCTION size (TOK_TABLE_OR_COL `_offsets`)) sz)))) t)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL sz)))) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ t:default__lineitem_lineitem_lshipdate_idx__
+ TableScan
+ alias: default__lineitem_lineitem_lshipdate_idx__
+ Select Operator
+ expressions:
+ expr: l_shipdate
+ type: string
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: int
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: sum(_col1)
+ bucketGroup: false
+ keys:
+ expr: year(_col0)
+ type: int
+ expr: month(_col0)
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: explain select year(L_SHIPDATE), month(L_SHIPDATE) as month_bkt, COUNT(1)
+ from lineitem
+group by year(L_SHIPDATE), month(L_SHIPDATE)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select year(L_SHIPDATE), month(L_SHIPDATE) as month_bkt, COUNT(1)
+ from lineitem
+group by year(L_SHIPDATE), month(L_SHIPDATE)
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL L_SHIPDATE))) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL L_SHIPDATE)) month_bkt) (TOK_SELEXPR (TOK_FUNCTION COUNT 1))) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL L_SHIPDATE)) (TOK_FUNCTION month (TOK_TABLE_OR_COL L_SHIPDATE)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__lineitem_lineitem_lshipdate_idx__
+ TableScan
+ alias: default__lineitem_lineitem_lshipdate_idx__
+ Select Operator
+ expressions:
+ expr: l_shipdate
+ type: string
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: string
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: sum(_col1)
+ bucketGroup: false
+ keys:
+ expr: year(_col0)
+ type: int
+ expr: month(_col0)
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: explain select lastyear.month,
+ thisyear.month,
+ (thisyear.monthly_shipments - lastyear.monthly_shipments) /
+lastyear.monthly_shipments as monthly_shipments_delta
+ from (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1997
+ group by year(l_shipdate), month(l_shipdate)
+ ) lastyear join
+ (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1998
+ group by year(l_shipdate), month(l_shipdate)
+ ) thisyear
+ on lastyear.month = thisyear.month
+ and lastyear.year = thisyear.year
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select lastyear.month,
+ thisyear.month,
+ (thisyear.monthly_shipments - lastyear.monthly_shipments) /
+lastyear.monthly_shipments as monthly_shipments_delta
+ from (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1997
+ group by year(l_shipdate), month(l_shipdate)
+ ) lastyear join
+ (select year(l_shipdate) as year,
+ month(l_shipdate) as month,
+ count(1) as monthly_shipments
+ from lineitem
+ where year(l_shipdate) = 1998
+ group by year(l_shipdate), month(l_shipdate)
+ ) thisyear
+ on lastyear.month = thisyear.month
+ and lastyear.year = thisyear.year
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1997)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) lastyear) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF lineitem)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1998)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) thisyear) (and (= (. (TOK_TABLE_OR_COL lastyear) month) (. (TOK_TABLE_OR_COL thisyear) month)) (= (. (TOK_TABLE_OR_COL lastyear) year) (. (TOK_TABLE_OR_COL thisyear) year))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL lastyear) month)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL thisyear) month)) (TOK_SELEXPR (/ (- (. (TOK_TABLE_OR_COL thisyear) monthly_shipments) (. (TOK_TABLE_OR_COL lastyear) monthly_shipments)) (. (TOK_TABLE_OR_COL lastyear) monthly_shipments)) monthly_shipments_delta))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-3
+ Stage-3 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ lastyear:default__lineitem_lineitem_lshipdate_idx__
+ TableScan
+ alias: default__lineitem_lineitem_lshipdate_idx__
+ Filter Operator
+ predicate:
+ expr: (year(l_shipdate) = 1997)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: l_shipdate
+ type: string
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ Filter Operator
+ predicate:
+ expr: (year(_col0) = 1997)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: string
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: sum(_col1)
+ bucketGroup: false
+ keys:
+ expr: year(_col0)
+ type: int
+ expr: month(_col0)
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: int
+ tag: 0
+ value expressions:
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ $INTNAME1
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: int
+ tag: 1
+ value expressions:
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col1} {VALUE._col2}
+ 1 {VALUE._col1} {VALUE._col2}
+ handleSkewJoin: false
+ outputColumnNames: _col1, _col2, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col4
+ type: int
+ expr: ((_col5 - _col2) / _col2)
+ type: double
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ thisyear:default__lineitem_lineitem_lshipdate_idx__
+ TableScan
+ alias: default__lineitem_lineitem_lshipdate_idx__
+ Filter Operator
+ predicate:
+ expr: (year(l_shipdate) = 1998)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: l_shipdate
+ type: string
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ Filter Operator
+ predicate:
+ expr: (year(_col0) = 1998)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: string
+ outputColumnNames: _col0, _col1
+ Group By Operator
+ aggregations:
+ expr: sum(_col1)
+ bucketGroup: false
+ keys:
+ expr: year(_col0)
+ type: int
+ expr: month(_col0)
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ value expressions:
+ expr: _col2
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: sum(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ expr: _col2
+ type: bigint
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: DROP TABLE tbl
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE tbl
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+PREHOOK: query: CREATE TABLE tbl(key int, value int)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE tbl(key int, value int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@tbl
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+PREHOOK: query: CREATE INDEX tbl_key_idx ON TABLE tbl(key) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD
+PREHOOK: type: CREATEINDEX
+POSTHOOK: query: CREATE INDEX tbl_key_idx ON TABLE tbl(key) AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD
+POSTHOOK: type: CREATEINDEX
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+PREHOOK: query: ALTER INDEX tbl_key_idx ON tbl REBUILD
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl
+PREHOOK: Output: default@default__tbl_tbl_key_idx__
+POSTHOOK: query: ALTER INDEX tbl_key_idx ON tbl REBUILD
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl
+POSTHOOK: Output: default@default__tbl_tbl_key_idx__
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+PREHOOK: query: EXPLAIN select key, count(key) from tbl where key = 1 group by key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select key, count(key) from tbl where key = 1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key)))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__tbl_tbl_key_idx__
+ TableScan
+ Filter Operator
+ predicate:
+ expr: (key = 1)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (key = 1)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__tbl_tbl_key_idx__
+ TableScan
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN select count(1) from tbl
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select count(1) from tbl
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Select Operator
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN select key, count(key) from tbl group by key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select key, count(key) from tbl group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key)))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__tbl_tbl_key_idx__
+ TableScan
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ expr: size(_offsets)
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN select count(key) from tbl
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select count(key) from tbl
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(key)
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__tbl_tbl_key_idx__
+ TableScan
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__tbl_tbl_key_idx__
+ TableScan
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__tbl_tbl_key_idx__
+ TableScan
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ outputColumnNames: key, value
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__tbl_tbl_key_idx__
+ TableScan
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY value, key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY value, key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Select Operator
+ expressions:
+ expr: value
+ type: int
+ expr: key
+ type: int
+ outputColumnNames: value, key
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: value
+ type: int
+ expr: key
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT key, value FROM tbl GROUP BY value, key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key, value FROM tbl GROUP BY value, key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_GROUPBY (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Select Operator
+ expressions:
+ expr: value
+ type: int
+ expr: key
+ type: int
+ outputColumnNames: value, key
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: value
+ type: int
+ expr: key
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col1
+ type: int
+ expr: _col0
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) 2))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Filter Operator
+ predicate:
+ expr: (value = 2)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (value = 2)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ outputColumnNames: key, value
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 AND key = 3
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 AND key = 3
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (AND (= (TOK_TABLE_OR_COL value) 2) (= (TOK_TABLE_OR_COL key) 3)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Filter Operator
+ predicate:
+ expr: ((value = 2) and (key = 3))
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: ((value = 2) and (key = 3))
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ outputColumnNames: key, value
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Filter Operator
+ predicate:
+ expr: (value = key)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (value = key)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ outputColumnNames: key, value
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT key FROM tbl WHERE key = 3 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key FROM tbl WHERE key = 3 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 3)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ default__tbl_tbl_key_idx__
+ TableScan
+ Filter Operator
+ predicate:
+ expr: (key = 3)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (key = 3)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT key, value FROM tbl WHERE value = 1 GROUP BY key, value
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key, value FROM tbl WHERE value = 1 GROUP BY key, value
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) 1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL value))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Filter Operator
+ predicate:
+ expr: (value = 1)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (value = 1)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ outputColumnNames: key, value
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))))) v1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (. (TOK_TABLE_OR_COL v1) value) 2))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ v1:tbl
+ TableScan
+ alias: tbl
+ Filter Operator
+ predicate:
+ expr: (value = 2)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ outputColumnNames: key, value
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ outputColumnNames: _col0, _col1
+ Filter Operator
+ predicate:
+ expr: (_col1 = 2)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: int
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT key FROM tbl WHERE value = 2 GROUP BY key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key FROM tbl WHERE value = 2 GROUP BY key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) 2)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Filter Operator
+ predicate:
+ expr: (value = 2)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (value = 2)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: key
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: key
+ type: int
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl WHERE value = key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl WHERE value = key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 2 3))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Filter Operator
+ predicate:
+ expr: (value = key)
+ type: boolean
+ Filter Operator
+ predicate:
+ expr: (value = key)
+ type: boolean
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ outputColumnNames: key, value
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: key
+ type: int
+ expr: substr(value, 2, 3)
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 2 3)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ expr: value
+ type: int
+ outputColumnNames: key, value
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: key
+ type: int
+ expr: substr(value, 2, 3)
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key, substr(key,2,3)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key, substr(key,2,3)
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_TABREF tbl)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_FUNCTION substr (TOK_TABLE_OR_COL key) 2 3))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ tbl
+ TableScan
+ alias: tbl
+ Select Operator
+ expressions:
+ expr: key
+ type: int
+ outputColumnNames: key
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: key
+ type: int
+ expr: substr(key, 2, 3)
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ tag: -1
+ Reduce Operator Tree:
+ Group By Operator
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: int
+ expr: KEY._col1
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: DROP TABLE tbl
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@tbl
+PREHOOK: Output: default@tbl
+POSTHOOK: query: DROP TABLE tbl
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@tbl
+POSTHOOK: Output: default@tbl
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]