diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 46739b7..d9681e5 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -353,6 +353,10 @@ public class HiveConf extends Configuration { // For har files HIVEARCHIVEENABLED("hive.archive.enabled", false), HIVEHARPARENTDIRSETTABLE("hive.archive.har.parentdir.settable", false), + + //Enable/Disable gbToIdx rewrite rule + HIVEOPTGBYUSINGINDEX("hive.optimize.index.groupby", false), + HIVEOUTERJOINSUPPORTSFILTERS("hive.outerjoin.supports.filters", true), // Serde for FetchTask diff --git ql/src/java/org/apache/hadoop/hive/ql/index/AggregateIndexHandler.java ql/src/java/org/apache/hadoop/hive/ql/index/AggregateIndexHandler.java new file mode 100644 index 0000000..9948189 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/index/AggregateIndexHandler.java @@ -0,0 +1,143 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.index; + +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Set; + +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.Index; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.ql.Driver; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.hooks.ReadEntity; +import org.apache.hadoop.hive.ql.hooks.WriteEntity; +import org.apache.hadoop.hive.ql.index.compact.IndexMetadataChangeTask; +import org.apache.hadoop.hive.ql.index.compact.IndexMetadataChangeWork; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.HiveUtils; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; + + +public class AggregateIndexHandler extends TableBasedIndexHandler { + + @Override + public void analyzeIndexDefinition(Table baseTable, Index index, + Table indexTable) throws HiveException { + StorageDescriptor storageDesc = index.getSd(); + if (this.usesIndexTable() && indexTable != null) { + StorageDescriptor indexTableSd = storageDesc.deepCopy(); + List indexTblCols = indexTableSd.getCols(); + FieldSchema bucketFileName = new FieldSchema("_bucketname", "string", ""); + indexTblCols.add(bucketFileName); + FieldSchema offSets = new FieldSchema("_offsets", "array", ""); + indexTblCols.add(offSets); + FieldSchema countkey = new FieldSchema("_countkey", "int", ""); + indexTblCols.add(countkey); + FieldSchema countall = new FieldSchema("_countall", "int", ""); + indexTblCols.add(countall); + indexTable.setSd(indexTableSd); + } + } + + + @Override + protected Task getIndexBuilderMapRedTask(Set inputs, Set outputs, + List indexField, boolean partitioned, + PartitionDesc indexTblPartDesc, String indexTableName, + PartitionDesc baseTablePartDesc, String baseTableName, String dbName) { + + String indexCols = HiveUtils.getUnparsedColumnNamesFromFieldSchema(indexField); + + //form a new insert overwrite query. 
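The statement assembled just below is easiest to read from an example. For a hypothetical unpartitioned base table src with a single index column key and an index table named src_agg_idx (identifiers are assumed to be backquoted by HiveUtils.unparseIdentifier, and INPUT__FILE__NAME / BLOCK__OFFSET__INSIDE__FILE are the names the FILENAME and BLOCKOFFSET virtual columns are expected to resolve to), the generated HiveQL would look roughly like:

    INSERT OVERWRITE TABLE `src_agg_idx`
    SELECT key, INPUT__FILE__NAME, collect_set(BLOCK__OFFSET__INSIDE__FILE), count(key), count(*)
    FROM `src`
    GROUP BY key, INPUT__FILE__NAME

Each row of the index table therefore carries an index key value, the file it occurs in, the set of block offsets within that file, and the precomputed counts that populate the _countkey and _countall columns declared in analyzeIndexDefinition above, which the group-by rewrite reads back later.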
+ StringBuilder command = new StringBuilder(); + LinkedHashMap partSpec = indexTblPartDesc.getPartSpec(); + + command.append("INSERT OVERWRITE TABLE " + HiveUtils.unparseIdentifier(indexTableName)); + if (partitioned && indexTblPartDesc != null) { + command.append(" PARTITION ( "); + List ret = getPartKVPairStringArray(partSpec); + for (int i = 0; i < ret.size(); i++) { + String partKV = ret.get(i); + command.append(partKV); + if (i < ret.size() - 1) { + command.append(","); + } + } + command.append(" ) "); + } + + command.append(" SELECT "); + command.append(indexCols); + command.append(","); + + command.append(VirtualColumn.FILENAME.getName()); + command.append(","); + command.append(" collect_set ("); + command.append(VirtualColumn.BLOCKOFFSET.getName()); + command.append(") "); + command.append(","); + + Iterator fsItr = indexField.iterator(); + while(fsItr.hasNext()){ + FieldSchema indexColFs = fsItr.next(); + String indexCol = indexColFs.getName(); + command.append(" count("); + command.append(indexCol); + command.append(") "); + command.append(","); + } + + command.append(" count(*) "); + command.append(" FROM " + HiveUtils.unparseIdentifier(baseTableName)); + LinkedHashMap basePartSpec = baseTablePartDesc.getPartSpec(); + if(basePartSpec != null) { + command.append(" WHERE "); + List pkv = getPartKVPairStringArray(basePartSpec); + for (int i = 0; i < pkv.size(); i++) { + String partKV = pkv.get(i); + command.append(partKV); + if (i < pkv.size() - 1) { + command.append(" AND "); + } + } + } + command.append(" GROUP BY "); + command.append(indexCols + ", " + VirtualColumn.FILENAME.getName()); + + Driver driver = new Driver(new HiveConf(getConf(), AggregateIndexHandler.class)); + driver.compile(command.toString()); + + Task rootTask = driver.getPlan().getRootTasks().get(0); + inputs.addAll(driver.getPlan().getInputs()); + outputs.addAll(driver.getPlan().getOutputs()); + + IndexMetadataChangeWork indexMetaChange = new IndexMetadataChangeWork(partSpec, indexTableName, dbName); + IndexMetadataChangeTask indexMetaChangeTsk = new IndexMetadataChangeTask(); + indexMetaChangeTsk.setWork(indexMetaChange); + rootTask.addDependentTask(indexMetaChangeTsk); + + return rootTask; + } + } \ No newline at end of file diff --git ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndex.java ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndex.java index 308d985..e2e69da 100644 --- ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndex.java +++ ql/src/java/org/apache/hadoop/hive/ql/index/HiveIndex.java @@ -30,7 +30,8 @@ public class HiveIndex { public static String INDEX_TABLE_CREATETIME = "hive.index.basetbl.dfs.lastModifiedTime"; public static enum IndexType { - COMPACT_SUMMARY_TABLE("compact", "org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler"); + COMPACT_SUMMARY_TABLE("compact", "org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler"), + AGGREGATE_TABLE("aggregate", "org.apache.hadoop.hive.ql.index.AggregateIndexHandler"); private IndexType(String indexType, String className) { indexTypeName = indexType; diff --git ql/src/java/org/apache/hadoop/hive/ql/index/TableBasedIndexHandler.java ql/src/java/org/apache/hadoop/hive/ql/index/TableBasedIndexHandler.java new file mode 100644 index 0000000..81a9cd9 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/index/TableBasedIndexHandler.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.index; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Set; +import java.util.Map.Entry; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.hooks.ReadEntity; +import org.apache.hadoop.hive.ql.hooks.WriteEntity; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.HiveUtils; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; + +/* + * Borrowed this code from HIVE-1803.6.patch + * */ +public abstract class TableBasedIndexHandler extends AbstractIndexHandler { + protected Configuration configuration; + + @Override + public List> generateIndexBuildTaskList( + org.apache.hadoop.hive.ql.metadata.Table baseTbl, + org.apache.hadoop.hive.metastore.api.Index index, + List indexTblPartitions, List baseTblPartitions, + org.apache.hadoop.hive.ql.metadata.Table indexTbl, + Set inputs, Set outputs) throws HiveException { + try { + + TableDesc desc = Utilities.getTableDesc(indexTbl); + + List newBaseTblPartitions = new ArrayList(); + + List> indexBuilderTasks = new ArrayList>(); + + if (!baseTbl.isPartitioned()) { + // the table does not have any partition, then create index for the + // whole table + Task indexBuilder = getIndexBuilderMapRedTask(inputs, outputs, index.getSd().getCols(), false, + new PartitionDesc(desc, null), indexTbl.getTableName(), + new PartitionDesc(Utilities.getTableDesc(baseTbl), null), + baseTbl.getTableName(), indexTbl.getDbName()); + indexBuilderTasks.add(indexBuilder); + } else { + + // check whether the index table partitions are still exists in base + // table + for (int i = 0; i < indexTblPartitions.size(); i++) { + Partition indexPart = indexTblPartitions.get(i); + Partition basePart = null; + for (int j = 0; j < baseTblPartitions.size(); j++) { + if (baseTblPartitions.get(j).getName().equals(indexPart.getName())) { + basePart = baseTblPartitions.get(j); + newBaseTblPartitions.add(baseTblPartitions.get(j)); + break; + } + } + if (basePart == null) { + throw new RuntimeException( + "Partitions of base table and index table are inconsistent."); + } + // for each partition, spawn a map reduce task. 
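Each spawned task runs the index-builder query (e.g. the INSERT OVERWRITE built by AggregateIndexHandler) against exactly one partition. As a concrete illustration (hypothetical partition spec; unparseIdentifier is assumed to backquote the key), the helper getPartKVPairStringArray defined further below renders a spec such as {ds=2008-04-08, hr=12} as the fragments

    `ds` = '2008-04-08'
    `hr` = '12'

which the builder query joins with "," inside its PARTITION ( ... ) clause for the index table, and with " AND " in the WHERE clause that restricts the scan of the base table to the matching partition.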
+ Task indexBuilder = getIndexBuilderMapRedTask(inputs, outputs, index.getSd().getCols(), true, + new PartitionDesc(indexPart), indexTbl.getTableName(), + new PartitionDesc(basePart), baseTbl.getTableName(), indexTbl.getDbName()); + indexBuilderTasks.add(indexBuilder); + } + } + return indexBuilderTasks; + } catch (Exception e) { + throw new SemanticException(e); + } + } + + abstract protected Task getIndexBuilderMapRedTask(Set inputs, Set outputs, + List indexField, boolean partitioned, + PartitionDesc indexTblPartDesc, String indexTableName, + PartitionDesc baseTablePartDesc, String baseTableName, String dbName); + + protected List getPartKVPairStringArray( + LinkedHashMap partSpec) { + List ret = new ArrayList(partSpec.size()); + Iterator> iter = partSpec.entrySet().iterator(); + while (iter.hasNext()) { + StringBuilder sb = new StringBuilder(); + Entry p = iter.next(); + sb.append(HiveUtils.unparseIdentifier(p.getKey())); + sb.append(" = "); + sb.append("'"); + sb.append(HiveUtils.escapeString(p.getValue())); + sb.append("'"); + ret.add(sb.toString()); + } + return ret; + } + + @Override + public boolean usesIndexTable() { + return true; + } + + @Override + public Configuration getConf() { + return configuration; + } + + @Override + public void setConf(Configuration conf) { + this.configuration = conf; + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/..orig ql/src/java/org/apache/hadoop/hive/ql/metadata/..orig new file mode 100644 index 0000000..e69de29 diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java index 916b235..ba5a60e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java @@ -680,6 +680,16 @@ public class Hive { throw new HiveException(e); } } + public List getIndexesOnTable(String db_name, String tbl_name, + short max) throws HiveException { + try { + return getMSC().listIndexes(db_name, tbl_name, max); + } catch (NoSuchObjectException e) { + throw new HiveException("Partition or table doesn't exist.", e); + } catch (Exception e) { + throw new HiveException("Unknown error. Please check logs.", e); + } + } public boolean dropIndex(String db_name, String tbl_name, String index_name, boolean deleteData) throws HiveException { try { @@ -687,7 +697,7 @@ public class Hive { } catch (NoSuchObjectException e) { throw new HiveException("Partition or table doesn't exist.", e); } catch (Exception e) { - throw new HiveException("Unknow error. Please check logs.", e); + throw new HiveException("Unknown error. Please check logs.", e); } } @@ -1332,7 +1342,7 @@ public class Hive { } catch (NoSuchObjectException e) { throw new HiveException("Partition or table doesn't exist.", e); } catch (Exception e) { - throw new HiveException("Unknow error. Please check logs.", e); + throw new HiveException("Unknown error. 
Please check logs.", e); } } @@ -1927,6 +1937,4 @@ public class Hive { private static String[] getQualifiedNames(String qualifiedName) { return qualifiedName.split("\\."); } - - }; diff --git ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java index 50db44c..a23ed09 100644 --- ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java +++ ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java @@ -38,6 +38,7 @@ import org.apache.hadoop.hive.metastore.MetaStoreUtils; import org.apache.hadoop.hive.metastore.ProtectMode; import org.apache.hadoop.hive.metastore.TableType; import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.Index; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.Order; import org.apache.hadoop.hive.metastore.api.SerDeInfo; @@ -813,4 +814,15 @@ public class Table implements Serializable { public String getCompleteName() { return getDbName() + "@" + getTableName(); } + + /** + * @return the list of indexes defined on this table, + * if any exist + * @throws HiveException + **/ + public List getAllIndexes(short max) throws HiveException { + Hive hive = Hive.get(); + return hive.getIndexesOnTable(getTTable().getDbName(), getTTable().getTableName(), max); + } + }; diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java index 590d69a..be04c5e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.List; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.optimizer.index.RewriteGBUsingIndex; import org.apache.hadoop.hive.ql.optimizer.lineage.Generator; import org.apache.hadoop.hive.ql.optimizer.pcr.PartitionConditionRemover; import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner; @@ -49,6 +50,9 @@ public class Optimizer { if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTCP)) { transformations.add(new ColumnPruner()); } + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTGBYUSINGINDEX)) { + transformations.add(new RewriteGBUsingIndex()); + } if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTPPD)) { transformations.add(new PredicatePushDown()); transformations.add(new PartitionPruner()); diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteParseContextGenerator.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteParseContextGenerator.java new file mode 100644 index 0000000..8cc0999 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/RewriteParseContextGenerator.java @@ -0,0 +1,120 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.optimizer.index.RewriteGBUsingIndex; +import org.apache.hadoop.hive.ql.optimizer.index.RewriteIndexSubqueryProcFactory; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.ParseDriver; +import org.apache.hadoop.hive.ql.parse.ParseException; +import org.apache.hadoop.hive.ql.parse.ParseUtils; +import org.apache.hadoop.hive.ql.parse.QB; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzerFactory; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/** + * RewriteParseContextGenerator is a class that offers methods to generate operator tree + * for input queries. It is implemented on lines of the analyzeInternal(..) method + * of {@link SemanticAnalyzer} but it creates only the ParseContext for the input query command. + * It does not optimize or generate map-reduce tasks for the input query. + * This can be used when you need to create operator tree for an internal query. + * For example, {@link RewriteGBUsingIndex} uses the {@link RewriteIndexSubqueryProcFactory} methods to + * generate subquery that scans over index table rather than original table. + * + */ +public final class RewriteParseContextGenerator { + protected static Log LOG = LogFactory.getLog(RewriteParseContextGenerator.class.getName()); + + /** + * Parse the input {@link String} command and generate a ASTNode tree + * @param conf + * @param command + * @return + */ + public static ParseContext generateOperatorTree(HiveConf conf, String command){ + Context ctx; + ParseContext subPCtx = null; + try { + ctx = new Context(conf); + ParseDriver pd = new ParseDriver(); + ASTNode tree = pd.parse(command, ctx); + tree = ParseUtils.findRootNonNullToken(tree); + + BaseSemanticAnalyzer sem = SemanticAnalyzerFactory.get(conf, tree); + doSemanticAnalysis(sem, tree, ctx); + + subPCtx = ((SemanticAnalyzer) sem).getParseContext(); + LOG.info("Sub-query Semantic Analysis Completed"); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (ParseException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (SemanticException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return subPCtx; + + } + + /** + * For the input ASTNode tree, perform a semantic analysis and check metadata + * Generate a operator tree and return the {@link ParseContext} instance for the operator tree + * + * @param ctx + * @param sem + * @param ast + * @return + * @throws SemanticException + */ + private static void doSemanticAnalysis(BaseSemanticAnalyzer sem, ASTNode ast, Context ctx) throws SemanticException { + + if(sem instanceof SemanticAnalyzer){ + QB qb = new QB(null, null, false); + ASTNode child = ast; + ParseContext subPCtx = ((SemanticAnalyzer) sem).getParseContext(); + subPCtx.setContext(ctx); + ((SemanticAnalyzer) sem).init(subPCtx); + + LOG.info("Starting Sub-query Semantic Analysis"); + ((SemanticAnalyzer) sem).doPhase1(child, qb, ((SemanticAnalyzer) 
sem).initPhase1Ctx()); + LOG.info("Completed phase 1 of Sub-query Semantic Analysis"); + + ((SemanticAnalyzer) sem).getMetaData(qb); + LOG.info("Completed getting MetaData in Sub-query Semantic Analysis"); + + LOG.info("Sub-query Abstract syntax tree: " + ast.toStringTree()); + ((SemanticAnalyzer) sem).genPlan(qb); + + LOG.info("Sub-query Completed plan generation"); + } + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteCanApplyCtx.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteCanApplyCtx.java new file mode 100644 index 0000000..7619c0f --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteCanApplyCtx.java @@ -0,0 +1,339 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer.index; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.metastore.api.Index; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.PreOrderWalker; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/** + * RewriteCanApplyCtx class stores the context for the {@link RewriteCanApplyProcFactory} to determine + * if any index can be used and if the input query meets all the criteria for rewrite optimization. 
+ */ +public final class RewriteCanApplyCtx implements NodeProcessorCtx { + + protected final Log LOG = LogFactory.getLog(RewriteCanApplyCtx.class.getName()); + + private RewriteCanApplyCtx(ParseContext parseContext) { + this.parseContext = parseContext; + } + + public static RewriteCanApplyCtx getInstance(ParseContext parseContext){ + return new RewriteCanApplyCtx(parseContext); + } + + // Rewrite Variables + public int agg_func_cnt = 0; + public int gby_key_cnt = 0; + public boolean query_has_sort_by = false; + public boolean query_has_order_by = false; + public boolean query_has_distribute_by = false; + public boolean query_has_group_by = false; + public boolean query_has_distinct = false; + public boolean agg_func_is_not_count = false; + public boolean agg_func_cols_fetch_exception = false; + public boolean whr_clause_cols_fetch_exception = false; + public boolean sel_clause_cols_fetch_exception = false; + public boolean gby_keys_fetch_exception = false; + public boolean count_on_all_cols = false; + public boolean query_has_generic_udf_on_groupby_key = false; + public boolean query_has_multiple_tables = false; + public boolean should_append_subquery = false; + public boolean remove_group_by = false; + + + //Data structures that are populated in the RewriteCanApplyProcFactory methods to check if the index key meets all criteria + Set selectColumnsList = new LinkedHashSet(); + Set predicateColumnsList = new LinkedHashSet(); + Set gbKeyNameList = new LinkedHashSet(); + Set aggFuncColList = new LinkedHashSet(); + + private int aggFuncCnt = 0; + private final ParseContext parseContext; + private String baseTableName = ""; + + void resetCanApplyCtx(){ + agg_func_cnt = 0; + gby_key_cnt = 0; + query_has_sort_by = false; + query_has_order_by = false; + query_has_distribute_by = false; + query_has_group_by = false; + query_has_distinct = false; + agg_func_is_not_count = false; + agg_func_cols_fetch_exception = false; + whr_clause_cols_fetch_exception = false; + sel_clause_cols_fetch_exception = false; + gby_keys_fetch_exception = false; + count_on_all_cols = false; + query_has_generic_udf_on_groupby_key = false; + query_has_multiple_tables = false; + should_append_subquery = false; + remove_group_by = false; + + aggFuncCnt = 0; + selectColumnsList.clear(); + predicateColumnsList.clear(); + gbKeyNameList.clear(); + aggFuncColList.clear(); + baseTableName = ""; + } + + public Set getSelectColumnsList() { + return selectColumnsList; + } + + public void setSelectColumnsList(Set selectColumnsList) { + this.selectColumnsList = selectColumnsList; + } + + public Set getPredicateColumnsList() { + return predicateColumnsList; + } + + public void setPredicateColumnsList(Set predicateColumnsList) { + this.predicateColumnsList = predicateColumnsList; + } + + public Set getGbKeyNameList() { + return gbKeyNameList; + } + + public void setGbKeyNameList(Set gbKeyNameList) { + this.gbKeyNameList = gbKeyNameList; + } + + public Set getAggFuncColList() { + return aggFuncColList; + } + + public void setAggFuncColList(Set aggFuncColList) { + this.aggFuncColList = aggFuncColList; + } + + public int getAggFuncCnt() { + return aggFuncCnt; + } + + public void setAggFuncCnt(int aggFuncCnt) { + this.aggFuncCnt = aggFuncCnt; + } + + public String getBaseTableName() { + return baseTableName; + } + + public void setBaseTableName(String baseTableName) { + this.baseTableName = baseTableName; + } + + public ParseContext getParseContext() { + return parseContext; + } + + + /** + * This method walks all the nodes starting from 
topOp TableScanOperator node + * and invokes methods from {@link RewriteCanApplyProcFactory} for each of the rules + * added to the opRules map. We use the {@link DefaultGraphWalker} for a post-order + * traversal of the operator tree. + * + * The methods from {@link RewriteCanApplyProcFactory} set appropriate values in + * {@link RewriteVars} enum. + * + * @param topOp + */ + void populateRewriteVars(Operator topOp){ + Map opRules = new LinkedHashMap(); + opRules.put(new RuleRegExp("R1", "FIL%"), RewriteCanApplyProcFactory.canApplyOnFilterOperator()); + opRules.put(new RuleRegExp("R2", "GBY%"), RewriteCanApplyProcFactory.canApplyOnGroupByOperator()); + opRules.put(new RuleRegExp("R3", "RS%OP%"), RewriteCanApplyProcFactory.canApplyOnExtractOperator()); + opRules.put(new RuleRegExp("R4", "SEL%"), RewriteCanApplyProcFactory.canApplyOnSelectOperator()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this); + GraphWalker ogw = new PreOrderWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.add(topOp); + + try { + ogw.startWalking(topNodes, null); + } catch (SemanticException e) { + LOG.info("Exception in walking operator tree. Rewrite variables not populated", e); + } + + } + + + /** + * Default procedure for {@link DefaultRuleDispatcher} + * @return + */ + private NodeProcessor getDefaultProc() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack stack, + NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { + return null; + } + }; + } + + + //Map for base table to index table mapping + //TableScan operator for base table will be modified to read from index table + private final HashMap baseToIdxTableMap = new HashMap();; + + + public void addTable(String baseTableName, String indexTableName) { + baseToIdxTableMap.put(baseTableName, indexTableName); + } + + public String findBaseTable(String baseTableName) { + return baseToIdxTableMap.get(baseTableName); + } + + + boolean isIndexUsableForQueryBranchRewrite(Index index, Set indexKeyNames){ + boolean removeGroupBy = true; + boolean optimizeCount = false; + + //-------------------------------------------- + //Check if all columns in select list are part of index key columns + if (!indexKeyNames.containsAll(selectColumnsList)) { + LOG.info("Select list has non index key column : " + + " Cannot use index " + index.getIndexName()); + return false; + } + + //-------------------------------------------- + // Check if all columns in where predicate are part of index key columns + // TODO: Currently we allow all predicates , would it be more efficient + // (or at least not worse) to read from index_table and not from baseTable? + if (!indexKeyNames.containsAll(predicateColumnsList)) { + LOG.info("Predicate column ref list has non index key column : " + + " Cannot use index " + index.getIndexName()); + return false; + } + + //-------------------------------------------- + // For group by, we need to check if all keys are from index columns + // itself. Here GB key order can be different than index columns but that does + // not really matter for final result. + // E.g. 
select c1, c2 from src group by c2, c1; + // we can rewrite this one to: + // select c1, c2 from src_cmpt_idx; + if (!indexKeyNames.containsAll(gbKeyNameList)) { + LOG.info("Group by key has some non-indexed columns, " + + " Cannot use index " + index.getIndexName()); + return false; + } + + // FUTURE: See if this can be relaxed. + // If we have agg function (currently only COUNT is supported), check if its input are + // from index. we currently support only that. + if (aggFuncColList.size() > 0) { + if (indexKeyNames.containsAll(aggFuncColList) == false) { + LOG.info("Agg Func input is not present in index key columns. Currently " + + "only agg func on index columns are supported by rewrite optimization" ); + return false; + } + // If we have count on some key, check if key is same as index key, + if (aggFuncColList.containsAll(indexKeyNames)) { + optimizeCount = true; + } + } + + if (!gbKeyNameList.containsAll(indexKeyNames)) { + // GB key and idx key are not same, don't remove GroupBy, but still do index scan + LOG.info("Index has some non-groupby columns, GroupBy will be" + + " preserved by rewrite optimization but original table scan" + + " will be replaced with index table scan." ); + removeGroupBy = false; + } + + // This check prevents to remove GroupBy for cases where the GROUP BY key cols are + // not simple expressions i.e. simple index key cols (in any order), but some + // expressions on the the key cols. + // e.g. + // 1. GROUP BY key, f(key) + // FUTURE: If f(key) output is functionally dependent on key, then we should support + // it. However we don't have mechanism/info about f() yet to decide that. + // 2. GROUP BY idxKey, 1 + // FUTURE: GB Key has literals along with idxKeyCols. Develop a rewrite to eliminate the + // literals from GB key. + // 3. GROUP BY idxKey, idxKey + // FUTURE: GB Key has dup idxKeyCols. Develop a rewrite to eliminate the dup key cols + // from GB key. + if (query_has_group_by && + indexKeyNames.size() < gby_key_cnt) { + LOG.info("Group by key has some non-indexed columns, GroupBy will be" + + " preserved by rewrite optimization" ); + removeGroupBy = false; + } + + + //Now that we are good to do this optimization, set parameters in context + //which would be used by transformation procedure as inputs. + + //sub-query is needed only in case of optimizecount and complex gb keys? + if(query_has_generic_udf_on_groupby_key == false + && !(optimizeCount == true && removeGroupBy == false) ) { + remove_group_by = removeGroupBy; + addTable(baseTableName, index.getIndexTableName()); + }else if(query_has_generic_udf_on_groupby_key == true && + agg_func_cnt == 1 && + agg_func_is_not_count == false){ + should_append_subquery = true; + addTable(baseTableName, index.getIndexTableName()); + }else{ + LOG.info("No valid criteria met to apply rewrite." ); + return false; + } + + return true; + } + + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteCanApplyProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteCanApplyProcFactory.java new file mode 100644 index 0000000..73b161d --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteCanApplyProcFactory.java @@ -0,0 +1,312 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer.index; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.ExtractOperator; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.exec.FilterOperator; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.HiveParser; +import org.apache.hadoop.hive.ql.parse.QBParseInfo; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.FilterDesc; +import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; + +/** + * Factory of methods used by {@link RewriteGBUsingIndex} (see checkEachDAGOperator(..) method) + * to determine if the rewrite optimization can be applied to the input query + * + */ +public final class RewriteCanApplyProcFactory { + protected final static Log LOG = LogFactory.getLog(RewriteCanApplyProcFactory.class.getName()); + private static RewriteCanApplyCtx canApplyCtx = null; + + private RewriteCanApplyProcFactory(){ + //this prevents the class from getting instantiated + } + + + /** + * Check for conditions in FilterOperator that do not meet rewrite criteria. + * Set the appropriate variables in {@link RewriteVars} enum. + */ + private static class CheckFilterProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + FilterOperator operator = (FilterOperator)nd; + canApplyCtx = (RewriteCanApplyCtx)ctx; + FilterDesc conf = (FilterDesc)operator.getConf(); + //The filter operator should have a predicate of ExprNodeGenericFuncDesc type. 
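As a worked illustration of what this processor gathers (hypothetical query branch): for WHERE key = 100 AND ds = '2008-04-08' the predicate arrives as a single ExprNodeGenericFuncDesc representing the AND, and calling getCols() on it is expected to return the referenced column names,

    getCols()             -> [key, ds]   (possibly still internal names at this stage)
    predicateColumnsList  -> {key, ds}   (later required to be a subset of the index key columns)

CheckSelectProc further below maps any internal names back to their aliases before that comparison against the index keys is made.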
+ //This represents the comparison operator + ExprNodeGenericFuncDesc oldengfd = (ExprNodeGenericFuncDesc) conf.getPredicate(); + if(oldengfd == null){ + canApplyCtx.whr_clause_cols_fetch_exception = true; + } + //The predicate should have valid left and right columns + List colList = oldengfd.getCols(); + if(colList == null || colList.size() == 0){ + canApplyCtx.whr_clause_cols_fetch_exception = true; + } + //Add the predicate columns to RewriteCanApplyCtx's predColRefs list to check later + //if index keys contain all filter predicate columns and vice-a-versa + for (String col : colList) { + canApplyCtx.getPredicateColumnsList().add(col); + } + + return null; + } + } + + public static CheckFilterProc canApplyOnFilterOperator() { + return new CheckFilterProc(); + } + + + + /** + * Check for conditions in GroupByOperator that do not meet rewrite criteria. + * Set the appropriate variables in {@link RewriteVars} enum. + * + */ + private static class CheckGroupByProc implements NodeProcessor { + + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + GroupByOperator operator = (GroupByOperator)nd; + canApplyCtx = (RewriteCanApplyCtx)ctx; + //for each group-by clause in query, only one GroupByOperator of the GBY-RS-GBY sequence is stored in getGroupOpToInputTables + //we need to process only this operator + //Also, we do not rewrite for cases when same query branch has multiple group-by constructs + if(canApplyCtx.getParseContext().getGroupOpToInputTables().containsKey(operator) && + canApplyCtx.query_has_group_by == false ){ + + canApplyCtx.query_has_group_by = true; + GroupByDesc conf = (GroupByDesc) operator.getConf(); + ArrayList aggrList = conf.getAggregators(); + if(aggrList != null && aggrList.size() > 0){ + for (AggregationDesc aggregationDesc : aggrList) { + int aggCnt = canApplyCtx.getAggFuncCnt(); + canApplyCtx.agg_func_cnt = aggCnt + 1; + canApplyCtx.setAggFuncCnt(aggCnt + 1); + //In the current implementation, we do not support more than 1 agg funcs in group-by + if(canApplyCtx.agg_func_cnt > 1) { + return false; + } + String aggFunc = aggregationDesc.getGenericUDAFName(); + if(!aggFunc.equals("count")){ + canApplyCtx.agg_func_is_not_count = true; + }else{ + ArrayList para = aggregationDesc.getParameters(); + //for a valid aggregation, it needs to have non-null parameter list + if(para == null){ + canApplyCtx.agg_func_cols_fetch_exception = true; + }else if(para.size() == 0){ + //count(*) case + canApplyCtx.count_on_all_cols = true; + }else{ + for(int i=0; i< para.size(); i++){ + ExprNodeDesc expr = para.get(i); + if(expr instanceof ExprNodeColumnDesc){ + //Add the columns to RewriteCanApplyCtx's selectColumnsList list to check later + //if index keys contain all select clause columns and vice-a-versa + //we get the select column 'actual' names only here if we have a agg func along with group-by + //SelectOperator has internal names in its colList data structure + canApplyCtx.getSelectColumnsList().add(((ExprNodeColumnDesc) expr).getColumn()); + //Add the columns to RewriteCanApplyCtx's aggFuncColList list to check later + //if columns contained in agg func are index key columns + canApplyCtx.getAggFuncColList().add(((ExprNodeColumnDesc) expr).getColumn()); + } + } + } + } + } + }else{ + //if group-by does not have aggregation list, then it "might" be a DISTINCT case + //this code uses query block to determine if the ASTNode tree contains the distinct TOK_SELECTDI token + QBParseInfo qbParseInfo = 
canApplyCtx.getParseContext().getQB().getParseInfo(); + Set clauseNameSet = qbParseInfo.getClauseNames(); + if (clauseNameSet.size() == 1) { + Iterator clauseNameIter = clauseNameSet.iterator(); + String clauseName = clauseNameIter.next(); + ASTNode rootSelExpr = qbParseInfo.getSelForClause(clauseName); + boolean isDistinct = (rootSelExpr.getType() == HiveParser.TOK_SELECTDI); + if(isDistinct) { + canApplyCtx.query_has_distinct = true; + } + } + } + + //we need to have non-null group-by keys for a valid group-by operator + ArrayList keyList = conf.getKeys(); + if(keyList == null || keyList.size() == 0){ + canApplyCtx.gby_keys_fetch_exception = true; + } + //sets the no. of keys in group by to be used later to determine if group-by has non-index cols + //group-by needs to be preserved in such cases (eg.group-by using a function on index key. This is the subquery append case) + canApplyCtx.gby_key_cnt = keyList.size(); + for (ExprNodeDesc expr : keyList) { + checkExpression(expr); + } + + } + + return null; + } + + private void checkExpression(ExprNodeDesc expr){ + if(expr instanceof ExprNodeColumnDesc){ + //Add the group-by keys to RewriteCanApplyCtx's gbKeyNameList list to check later + //if all keys are from index columns + canApplyCtx.getGbKeyNameList().addAll(expr.getCols()); + }else if(expr instanceof ExprNodeGenericFuncDesc){ + ExprNodeGenericFuncDesc funcExpr = (ExprNodeGenericFuncDesc)expr; + List childExprs = funcExpr.getChildExprs(); + for (ExprNodeDesc childExpr : childExprs) { + if(childExpr instanceof ExprNodeColumnDesc){ + //Set QUERY_HAS_GENERICUDF_ON_GROUPBY_KEY to true which is used later to determine + //whether the rewrite is a 'append subquery' case + //this is true in case the group-by key is a GenericUDF like year,month etc + canApplyCtx.query_has_generic_udf_on_groupby_key = true; + canApplyCtx.getGbKeyNameList().addAll(expr.getCols()); + canApplyCtx.getSelectColumnsList().add(((ExprNodeColumnDesc) childExpr).getColumn()); + }else if(childExpr instanceof ExprNodeGenericFuncDesc){ + checkExpression(childExpr); + } + } + } + } + } + + + public static CheckGroupByProc canApplyOnGroupByOperator() { + return new CheckGroupByProc(); + } + + + /** + * Check for conditions in ExtractOperator that do not meet rewrite criteria. + * Set the appropriate variables in {@link RewriteVars} enum. + * + */ + private static class CheckExtractProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... 
nodeOutputs) throws SemanticException { + ExtractOperator operator = (ExtractOperator)nd; + canApplyCtx = (RewriteCanApplyCtx)ctx; + //We get the information whether query has SORT BY, ORDER BY, DISTRIBUTE BY from + //the parent ReduceSinkOperator of the current ExtractOperator + if(operator.getParentOperators() != null && operator.getParentOperators().size() >0){ + Operator interim = operator.getParentOperators().get(0); + if(interim instanceof ReduceSinkOperator){ + ReduceSinkDesc conf = (ReduceSinkDesc) interim.getConf(); + ArrayList partCols = conf.getPartitionCols(); + int nr = conf.getNumReducers(); + if(nr == -1){ + if(partCols != null && partCols.size() > 0){ + //query has distribute-by is there are non-zero partition columns + canApplyCtx.query_has_distribute_by = true; + }else{ + //we do not need partition columns in case of sort-by + canApplyCtx.query_has_sort_by = true; + } + }else if(nr == 1){ + //Query has order-by only if number of reducers is 1 + canApplyCtx.query_has_order_by = true; + } + + } + } + + return null; + } + } + + public static CheckExtractProc canApplyOnExtractOperator() { + return new CheckExtractProc(); + } + + /** + * Check for conditions in SelectOperator that do not meet rewrite criteria. + * Set the appropriate variables in {@link RewriteVars} enum. + * + */ + private static class CheckSelectProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + SelectOperator operator = (SelectOperator)nd; + canApplyCtx = (RewriteCanApplyCtx)ctx; + + List> childrenList = operator.getChildOperators(); + Operator child = childrenList.get(0); + if(child instanceof FileSinkOperator){ + Map internalToAlias = new LinkedHashMap(); + RowSchema rs = operator.getSchema(); + //to get the internal to alias mapping + ArrayList sign = rs.getSignature(); + for (ColumnInfo columnInfo : sign) { + internalToAlias.put(columnInfo.getInternalName(), columnInfo.getAlias()); + } + + //if FilterOperator predicate has internal column names, we need to retrieve the 'actual' column names to + //check if index keys contain all filter predicate columns and vice-a-versa + Iterator predItr = canApplyCtx.getPredicateColumnsList().iterator(); + while(predItr.hasNext()){ + String predCol = predItr.next(); + String newPredCol = ""; + if(internalToAlias.get(predCol) != null){ + newPredCol = internalToAlias.get(predCol); + canApplyCtx.getPredicateColumnsList().remove(predCol); + canApplyCtx.getPredicateColumnsList().add(newPredCol); + } + } + } + return null; + } + } + + public static CheckSelectProc canApplyOnSelectOperator() { + return new CheckSelectProc(); + } + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteGBUsingIndex.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteGBUsingIndex.java new file mode 100644 index 0000000..8de7ece --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteGBUsingIndex.java @@ -0,0 +1,520 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer.index; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.Index; +import org.apache.hadoop.hive.metastore.api.StorageDescriptor; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.RewriteParseContextGenerator; +import org.apache.hadoop.hive.ql.optimizer.Transform; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.QBParseInfo; +import org.apache.hadoop.hive.ql.parse.SemanticException; + + +/** + * RewriteGBUsingIndex is implemented as one of the Rule-based Optimizations. + * Implements optimizations for GroupBy clause rewrite using aggregate index. + * This optimization rewrites GroupBy query over base table to the query over simple table-scan over + * index table, if there is index on the group by key(s) or the distinct column(s). + * E.g. + * + * select key + * from table + * group by key; + * + * to + * + * select key + * from idx_table; + * + * + * The rewrite supports following queries + * - Queries having only those col refs that are in the index key. + * - Queries that have index key col refs + * - in SELECT + * - in WHERE + * - in GROUP BY + * - Queries with agg func COUNT(literal) or COUNT(index key col ref) + * in SELECT + * - Queries with SELECT DISTINCT index_key_col_refs + * - Queries having a subquery satisfying above condition (only the + * subquery is rewritten) + * + * FUTURE: + * - Many of the checks for above criteria rely on equivalence of expressions, + * but such framework/mechanism of expression equivalence isn't present currently or developed yet. + * This needs to be supported in order for better robust checks. This is critically important for + * correctness of a query rewrite system. 
+ * + * + * @see RewriteCanApplyCtx + * @see RewriteCanApplyProcFactory + * @see RewriteRemoveGroupbyCtx + * @see RewriteRemoveGroupbyProcFactory + * @see RewriteIndexSubqueryCtx + * @see RewriteIndexSubqueryProcFactory + * @see RewriteParseContextGenerator + * + */ +public class RewriteGBUsingIndex implements Transform { + private ParseContext parseContext; + private Hive hiveDb; + private HiveConf hiveConf; + protected final Log LOG = LogFactory.getLog(this.getClass().getName()); + + //Stores the list of top TableScanOperator names for which the rewrite can be applied and the action that needs to be performed for operator tree + //starting from this TableScanOperator + private final Map tsOpToProcess = new LinkedHashMap(); + + //Name of the current table on which rewrite is being performed + private String baseTableName = null; + private String indexTableName = null; + + /***************************************Index Validation Variables***************************************/ + final String SUPPORTED_INDEX_TYPE = + "org.apache.hadoop.hive.ql.index.AggregateIndexHandler"; + final String IDX_BUCKET_COL = "_bucketname"; + final String IDX_OFFSETS_ARRAY_COL = "_offsets"; + final String IDX_COUNT_KEY_COL = "_countkey"; + final String IDX_COUNT_ALL_COL = "_countall"; + + @Override + public ParseContext transform(ParseContext pctx) throws SemanticException { + parseContext = pctx; + hiveConf = parseContext.getConf(); + try { + hiveDb = Hive.get(hiveConf); + } catch (HiveException e) { + LOG.debug("Exception in getting hive conf", e); + e.printStackTrace(); + } + + + /* Check if the input query is internal query that inserts in table (eg. ALTER INDEX...REBUILD etc.) + * We do not apply optimization here. + * */ + if(isQueryInsertToTable()){ + return parseContext; + }else{ + /* Check if the input query passes all the tests to be eligible for a rewrite + * If yes, rewrite original query; else, return the current parseContext + * */ + if(shouldApplyOptimization()){ + LOG.info("Rewriting Original Query."); + rewriteOriginalQuery(); + } + return parseContext; + } + + } + + /** + * Use Query block's parse {@link QBParseInfo} information to check if the input query + * is an internal SQL. + * If it is true, we do not apply this optimization. + * @return + */ + private boolean isQueryInsertToTable(){ + QBParseInfo qbParseInfo = parseContext.getQB().getParseInfo(); + return qbParseInfo.isInsertToTable(); + } + + /** + * We traverse the current operator tree to check for conditions in which the + * optimization cannot be applied. + * + * At the end, we check if all conditions have passed for rewrite. If yes, we + * determine if the the index is usable for rewrite. Else, we log the condition which + * did not meet the rewrite criterion. + * + * @return + * @throws SemanticException + */ + boolean shouldApplyOptimization() throws SemanticException{ + boolean canApply = false; + if(ifQueryHasMultipleTables()){ + //We do not apply this optimization for this case as of now. + return false; + }else{ + /* + * This code iterates over each TableScanOperator from the topOps map from ParseContext. + * For each operator tree originating from this top TableScanOperator, we determine + * if the optimization can be applied. If yes, we add the name of the top table to + * the tsOpToProcess to apply rewrite later on. 
+ * */ + HashMap topToTable = parseContext.getTopToTable(); + HashMap> topOps = parseContext.getTopOps(); + Iterator topOpItr = topToTable.keySet().iterator(); + while(topOpItr.hasNext()){ + //Context for checking if this optimization can be applied to the input query + RewriteCanApplyCtx canApplyCtx = RewriteCanApplyCtx.getInstance(parseContext); + + TableScanOperator topOp = topOpItr.next(); + Table table = topToTable.get(topOp); + baseTableName = table.getTableName(); + HashMap> indexTableMap = getIndexTableInfoForRewrite(topOp); + + if(indexTableMap != null){ + if(indexTableMap.size() == 0){ + LOG.debug("No Valid Index Found to apply Rewrite, " + + "skipping " + getName() + " optimization" ); + return false; + } else if(indexTableMap.size() > 1){ + // a cost-based analysis can be done here to choose a single index table amongst all valid indexes to apply rewrite + // we leave this decision-making + LOG.debug("Table has multiple valid index tables to apply rewrite, skipping " + getName() + " optimization" ); + return false; + }else{ + canApplyCtx.setBaseTableName(baseTableName); + canApplyCtx.populateRewriteVars(topOp); + + Iterator indexMapItr = indexTableMap.keySet().iterator(); + Index index = null; + while(indexMapItr.hasNext()){ + //we rewrite the original query using the first valid index encountered + //this can be changed if we have a better mechanism to decide which index will produce a better rewrite + index = indexMapItr.next(); + canApply = canApplyCtx.isIndexUsableForQueryBranchRewrite(index, indexTableMap.get(index)); + if(canApply){ + canApply = checkIfAllRewriteCriteriaIsMet(canApplyCtx); + break; + } + } + indexTableName = index.getIndexTableName(); + + if(canApply && topOps.containsValue(topOp)) { + Iterator topOpNamesItr = topOps.keySet().iterator(); + while(topOpNamesItr.hasNext()){ + String topOpName = topOpNamesItr.next(); + if(topOps.get(topOpName).equals(topOp)){ + tsOpToProcess.put(topOpName, canApplyCtx); + } + } + } + } + } + } + } + return canApply; + } + + + /** + * Method to rewrite the input query if all optimization criteria is passed. + * The method iterates over the tsOpToProcess {@link ArrayList} to apply the rewrites + * + * @throws SemanticException + */ + private void rewriteOriginalQuery() { + HashMap> topOpMap = parseContext.getTopOps(); + Iterator tsOpItr = tsOpToProcess.keySet().iterator(); + while(tsOpItr.hasNext()){ + baseTableName = tsOpItr.next(); + RewriteCanApplyCtx canApplyCtx = tsOpToProcess.get(baseTableName); + TableScanOperator topOp = (TableScanOperator) topOpMap.get(baseTableName); + + /* This part of the code checks if the 'REMOVE_GROUP_BY' value in RewriteVars enum is set to true. + * If yes, it sets the environment for the RewriteRemoveGroupbyCtx context and invokes + * method to apply rewrite by removing group by construct operators from the original operator tree. 
+ * */ + if(canApplyCtx.remove_group_by){ + try { + //Context for removing the group by construct operators from the operator tree + RewriteRemoveGroupbyCtx removeGbyCtx = RewriteRemoveGroupbyCtx.getInstance(parseContext, hiveDb, indexTableName); + removeGbyCtx.invokeRemoveGbyProc(topOp); + parseContext = removeGbyCtx.getParseContext(); + parseContext.setOpParseCtx(removeGbyCtx.getOpc()); + LOG.info("Finished Group by Remove"); + } catch (SemanticException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + LOG.info("Exception in rewriting original query while using GB-to-IDX optimizer."); + } + //Getting back new parseContext and new OpParseContext after GBY-RS-GBY is removed + } + + /* This part of the code checks if the 'SHOULD_APPEND_SUBQUERY' value in RewriteVars enum is set to true. + * If yes, it sets the environment for the RewriteIndexSubqueryCtx context and invokes + * method to append a new subquery that scans over the index table rather than the original table. + * We first create the subquery context, then copy the RowSchema/RowResolver from subquery to original operator tree. + * */ + if(canApplyCtx.should_append_subquery){ + try { + //Context for appending a subquery to scan over the index table + RewriteIndexSubqueryCtx subqueryCtx = RewriteIndexSubqueryCtx.getInstance(parseContext, indexTableName, baseTableName, + canApplyCtx.getSelectColumnsList()); + subqueryCtx.createSubqueryContext(); + + HashMap subqTopOpMap = subqueryCtx.getSubqueryPctx().getTopToTable(); + Iterator subqTopOpItr = subqTopOpMap.keySet().iterator(); + TableScanOperator subqTopOp = null; + if(subqTopOpItr.hasNext()){ + subqTopOp = subqTopOpItr.next(); + subqueryCtx.invokeSubquerySelectSchemaProc(subqTopOp); + LOG.info("Finished Fetching subquery select schema"); + subqueryCtx.invokeFixAllOperatorSchemasProc(topOp); + parseContext = subqueryCtx.getParseContext(); + } + + LOG.info("Finished appending subquery"); + } catch (SemanticException e) { + LOG.debug("Exception in rewriting original query while using GB-to-IDX optimizer.", e); + } + } + } + + LOG.info("Finished Rewriting query"); + + } + + private String getName() { + return "RewriteGBUsingIndex"; + } + + + /** + * This method logs the reason for which we cannot apply the rewrite optimization. + * @return + */ + boolean checkIfAllRewriteCriteriaIsMet(RewriteCanApplyCtx canApplyCtx){ + if (canApplyCtx.query_has_distribute_by){ + LOG.debug("Query has distributeby clause, " + + "that is not supported with " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.query_has_sort_by){ + LOG.debug("Query has sortby clause, " + + "that is not supported with " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.query_has_order_by){ + LOG.debug("Query has orderby clause, " + + "that is not supported with " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.agg_func_cnt > 1 ){ + LOG.debug("More than 1 agg funcs: " + + "Not supported by " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.agg_func_is_not_count){ + LOG.debug("Agg func other than count is " + + "not supported by " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.count_on_all_cols){ + LOG.debug("Currently count function needs group by on key columns. 
This is a count(*) case., " + + "Cannot apply this " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.agg_func_cols_fetch_exception){ + LOG.debug("Got exception while locating child col refs " + + "of agg func, skipping " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.whr_clause_cols_fetch_exception){ + LOG.debug("Got exception while locating child col refs for where clause, " + + "skipping " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.sel_clause_cols_fetch_exception){ + LOG.debug("Got exception while locating child col refs for select list, " + + "skipping " + getName() + " optimization" ); + return false; + } + if (canApplyCtx.gby_keys_fetch_exception){ + LOG.debug("Got exception while locating child col refs for GroupBy key, " + + "skipping " + getName() + " optimization" ); + return false; + } + return true; + } + + + + /** + * This block of code iterates over the topToTable map from ParseContext + * to determine if the query has a scan over multiple tables. + * @return + */ + boolean ifQueryHasMultipleTables(){ + HashMap topToTable = parseContext.getTopToTable(); + Iterator valuesItr = topToTable.values().iterator(); + Set tableNameSet = new HashSet(); + while(valuesItr.hasNext()){ + Table table = valuesItr.next(); + tableNameSet.add(table.getTableName()); + } + if(tableNameSet.size() > 1){ + LOG.debug("Query has more than one table " + + "that is not supported with " + getName() + " optimization" ); + return true; + } + return false; + } + + + /** + * Given a base table meta data, and a list of index types for which we need to find a matching index, + * this method returns a list of matching index tables. + * @param baseTableMetaData + * @param matchIndexTypes + * @return + * @throws SemanticException + */ + List getIndexes(Table baseTableMetaData, List matchIndexTypes) throws SemanticException { + List matchingIndexes = new ArrayList(); + List indexesOnTable = null; + + try { + //this limit parameter is required by metastore API's and acts as a check + // to avoid huge payloads coming back from thrift + short maxNumOfIndexes = 1024; + indexesOnTable = baseTableMetaData.getAllIndexes(maxNumOfIndexes); + + } catch (HiveException e) { + LOG.error("Could not retrieve indexes on the base table. Check logs for error."); + throw new SemanticException(e.getMessage(), e); + } + + for (int i = 0; i < indexesOnTable.size(); i++) { + Index index = null; + index = indexesOnTable.get(i); + // The handler class implies the type of the index (e.g. compact + // summary index would be: + // "org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler"). + String indexType = index.getIndexHandlerClass(); + for (int j = 0; j < matchIndexTypes.size(); j++) { + if (indexType.equals(matchIndexTypes.get(j))) { + matchingIndexes.add(index); + break; + } + } + } + return matchingIndexes; + } + + + /** + * We retrieve the list of index tables on the current table (represented by the TableScanOperator) + * which can be used to apply rewrite on the original query + * and return if there are no index tables to be used for rewriting the input query. 
+ * + * @param topOp + * @return + * @throws SemanticException + */ + HashMap> getIndexTableInfoForRewrite(TableScanOperator topOp) throws SemanticException { + HashMap> indexTableMap = null; + TableScanOperator ts = (TableScanOperator) topOp; + Table tsTable = parseContext.getTopToTable().get(ts); + if (tsTable != null) { + List idxType = new ArrayList(); + idxType.add(SUPPORTED_INDEX_TYPE); + List indexTables = getIndexes(tsTable, idxType); + if (indexTables == null || indexTables.size() == 0) { + LOG.debug("Table " + baseTableName + " does not have aggregate index. " + + "Cannot apply " + getName() + " optimization" ); + }else{ + indexTableMap = populateIndexToKeysMap(indexTables); + } + } + return indexTableMap; + } + + + /** + * This code block iterates over indexes on the table and picks + * up the first index that satisfies the rewrite criteria. + * @param indexTables + * @return + */ + HashMap> populateIndexToKeysMap(List indexTables){ + Index index = null; + Hive hiveInstance = hiveDb; + HashMap> indexToKeysMap = new LinkedHashMap>(); + + for (int idxCtr = 0; idxCtr < indexTables.size(); idxCtr++) { + final Set indexKeyNames = new LinkedHashSet(); + index = indexTables.get(idxCtr); + + //Getting index key columns + StorageDescriptor sd = index.getSd(); + List idxColList = sd.getCols(); + for (FieldSchema fieldSchema : idxColList) { + indexKeyNames.add(fieldSchema.getName()); + } + + + // Check that the index schema is as expected. This code block should + // catch problems of this rewrite breaking when the AggregateIndexHandler + // index is changed. + // This dependency could be better handled by doing init-time check for + // compatibility instead of this overhead for every rewrite invocation. + ArrayList idxTblColNames = new ArrayList(); + try { + Table idxTbl = hiveInstance.getTable(index.getDbName(), + index.getIndexTableName()); + for (FieldSchema idxTblCol : idxTbl.getCols()) { + idxTblColNames.add(idxTblCol.getName()); + } + } catch (HiveException e) { + LOG.debug("Got exception while locating index table, " + + "skipping " + getName() + " optimization" ); + return indexToKeysMap; + } + assert(idxTblColNames.contains(IDX_BUCKET_COL)); + assert(idxTblColNames.contains(IDX_OFFSETS_ARRAY_COL)); + assert(idxTblColNames.contains(IDX_COUNT_KEY_COL)); + assert(idxTblColNames.contains(IDX_COUNT_ALL_COL)); + assert(idxTblColNames.size() == indexKeyNames.size() + 4); + + //we add all index tables which can be used for rewrite and defer the decision of using a particular index for later + //this is to allow choosing a index if a better mechanism is designed later to chose a better rewrite + indexToKeysMap.put(index, indexKeyNames); + } + return indexToKeysMap; + + } + + + + +} + diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteIndexSubqueryCtx.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteIndexSubqueryCtx.java new file mode 100644 index 0000000..41142ba --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteIndexSubqueryCtx.java @@ -0,0 +1,317 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer.index; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.PreOrderWalker; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.optimizer.RewriteParseContextGenerator; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; + +/** + * RewriteIndexSubqueryCtx class stores the context for the {@link RewriteIndexSubqueryProcFactory} processor factory methods + * + */ +public class RewriteIndexSubqueryCtx implements NodeProcessorCtx { + + private RewriteIndexSubqueryCtx(ParseContext parseContext, String indexTableName, + String baseTableName, Set selectColumnNames){ + //this prevents the class from getting instantiated + this.parseContext = parseContext; + this.indexName = indexTableName; + this.baseTableName = baseTableName; + this.selectColumnNames = selectColumnNames; + } + + public static RewriteIndexSubqueryCtx getInstance(ParseContext parseContext, String indexTableName, + String baseTableName, Set selectColumnNames){ + return new RewriteIndexSubqueryCtx(parseContext, indexTableName, baseTableName, selectColumnNames ); + } + protected final Log LOG = LogFactory.getLog(RewriteIndexSubqueryCtx.class.getName()); + + //This is populated in RewriteIndexSubqueryProcFactory's NewQuerySelectSchemaProc processor with the colExprMap of the + //SelectOperator whose parent is TableScanOperator + private Map newSelColExprMap = new LinkedHashMap(); + //The next two data structures are populated in RewriteIndexSubqueryProcFactory's NewQuerySelectSchemaProc processor + //with the colExprMap of the SelectOperator whose child is GroupByOperator + private final ArrayList newSelColList = new ArrayList(); + + // Initialise all data structures required to copy RowResolver, RowSchema, outputColumnNames, colList, colExprMap + //from subquery DAG to original DAG operators + private final ArrayList newOutputCols = new ArrayList(); + private Map newColExprMap = new HashMap(); + private final ArrayList newColList = new ArrayList(); + private final ArrayList newRS = new ArrayList(); + private 
RowResolver newRR = new RowResolver(); + + //This is populated in RewriteIndexSubqueryProcFactory's SubquerySelectSchemaProc processor for later + //use in NewQuerySelectSchemaProc processor + private final Map aliasToInternal = new LinkedHashMap(); + + // Get the parentOperators List for FileSinkOperator. We need this later to set the + // parentOperators for original DAG operator + private final List> subqFSParentList = new ArrayList>(); + + // We need the reference to this SelectOperator so that the original DAG can be appended here + private Operator subqSelectOp; + + //We replace the original TS operator with new TS operator from subquery context to scan over the index table + //rather than the original table + private Operator newTSOp; + + private final ParseContext parseContext; + private final Set selectColumnNames; + private final String indexName; + private final String baseTableName; + + private ParseContext subqueryPctx = null; + private ParseContext newDAGCtx = null; + + //We need the GenericUDAFEvaluator for GenericUDAF function "sum" when we append subquery to original operator tree + private GenericUDAFEvaluator eval = null; + + + public Set getSelectColumnNames() { + return selectColumnNames; + } + + public ArrayList getNewOutputCols() { + return newOutputCols; + } + + public Map getNewColExprMap() { + return newColExprMap; + } + + public void setNewColExprMap(Map newColExprMap) { + this.newColExprMap = newColExprMap; + } + + public ArrayList getNewColList() { + return newColList; + } + + public ArrayList getNewRS() { + return newRS; + } + + public RowResolver getNewRR() { + return newRR; + } + + public void setNewRR(RowResolver newRR) { + this.newRR = newRR; + } + + public List> getSubqFSParentList() { + return subqFSParentList; + } + + public Operator getSubqSelectOp() { + return subqSelectOp; + } + + public void setSubqSelectOp(Operator subqSelectOp) { + this.subqSelectOp = subqSelectOp; + } + + public Map getAliasToInternal() { + return aliasToInternal; + } + + public ParseContext getParseContext() { + return parseContext; + } + + public ParseContext getSubqueryPctx() { + return subqueryPctx; + } + + public void setSubqueryPctx(ParseContext subqueryPctx) { + this.subqueryPctx = subqueryPctx; + } + + public ParseContext getNewDAGCtx() { + return newDAGCtx; + } + + public void setNewDAGCtx(ParseContext newDAGCtx) { + this.newDAGCtx = newDAGCtx; + } + + public Map getNewSelColExprMap() { + return newSelColExprMap; + } + + public void setNewSelColExprMap(Map newSelColExprMap) { + this.newSelColExprMap = newSelColExprMap; + } + + public ArrayList getNewSelColList() { + return newSelColList; + } + + public String getIndexName() { + return indexName; + } + + public String getBaseTableName() { + return baseTableName; + } + + public GenericUDAFEvaluator getEval() { + return eval; + } + + public void setEval(GenericUDAFEvaluator eval) { + this.eval = eval; + } + + + public void setNewTSOp(Operator newTSOp) { + this.newTSOp = newTSOp; + } + + public Operator getNewTSOp() { + return newTSOp; + } + + /** + * We construct the string command for subquery using index key columns + * and use the {@link RewriteParseContextGenerator} to generate a operator tree + * and its ParseContext for the subquery string command + */ + void createSubqueryContext() { + String selKeys = ""; + for (String key : selectColumnNames) { + selKeys += key + ","; + } + String subqueryCommand = "select " + selKeys + " `_countkey` as CNT from " + indexName; + subqueryPctx = 
RewriteParseContextGenerator.generateOperatorTree(parseContext.getConf(), subqueryCommand); + + } + + /** + * Walk the original operator tree using the {@link DefaultGraphWalker} using the rules. + * Each of the rules invoke respective methods from the {@link RewriteIndexSubqueryProcFactory} + * to + * @param topOp + * @throws SemanticException + */ + public void invokeSubquerySelectSchemaProc(Operator topOp) throws SemanticException{ + Map opRules = new LinkedHashMap(); + //removes the subquery FileSinkOperator from subquery OpParseContext as + //we do not need to append FS operator to original operator tree + opRules.put(new RuleRegExp("R1", "FS%"), RewriteIndexSubqueryProcFactory.getSubqueryFileSinkProc()); + //copies the RowSchema, outputColumnNames, colList, RowResolver, columnExprMap to RewriteIndexSubqueryCtx data structures + opRules.put(new RuleRegExp("R2", "SEL%"), RewriteIndexSubqueryProcFactory.getSubquerySelectSchemaProc()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this); + GraphWalker ogw = new DefaultGraphWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.add(topOp); + ogw.startWalking(topNodes, null); + + } + + + + /** + * Walk the original operator tree using the {@link PreOrderWalker} using the rules. + * This method appends the subquery operator tree to original operator tree + * It replaces the original table scan operator with index table scan operator + * Method also copies the information from {@link RewriteIndexSubqueryCtx} to + * appropriate operators from the original operator tree + * @param topOp + * @throws SemanticException + */ + public void invokeFixAllOperatorSchemasProc(Operator topOp) throws SemanticException{ + Map opRules = new LinkedHashMap(); + + //appends subquery operator tree to original operator tree + opRules.put(new RuleRegExp("R1", "TS%"), RewriteIndexSubqueryProcFactory.getAppendSubqueryToOriginalQueryProc()); + + //copies RowSchema, outputColumnNames, colList, RowResolver, columnExprMap from RewriteIndexSubqueryCtx data structures + // to SelectOperator of original operator tree + opRules.put(new RuleRegExp("R2", "SEL%"), RewriteIndexSubqueryProcFactory.getNewQuerySelectSchemaProc()); + //Manipulates the ExprNodeDesc from FilterOperator predicate list as per colList data structure from RewriteIndexSubqueryCtx + opRules.put(new RuleRegExp("R3", "FIL%"), RewriteIndexSubqueryProcFactory.getNewQueryFilterSchemaProc()); + //Manipulates the ExprNodeDesc from GroupByOperator aggregation list, parameters list \ + //as per colList data structure from RewriteIndexSubqueryCtx + opRules.put(new RuleRegExp("R4", "GBY%"), RewriteIndexSubqueryProcFactory.getNewQueryGroupbySchemaProc()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this); + GraphWalker ogw = new PreOrderWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.add(topOp); + + ogw.startWalking(topNodes, null); + + } + + + /** + * Default procedure for {@link DefaultRuleDispatcher} + * @return + */ + private NodeProcessor getDefaultProc() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack stack, + NodeProcessorCtx procCtx, Object... 
nodeOutputs) throws SemanticException { + return null; + } + }; + } + + + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteIndexSubqueryProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteIndexSubqueryProcFactory.java new file mode 100644 index 0000000..7ce1b97 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteIndexSubqueryProcFactory.java @@ -0,0 +1,624 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer.index; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.exec.FilterOperator; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.optimizer.RewriteParseContextGenerator; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.FilterDesc; +import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; + +/** + * Factory of processors used in {@link RewriteGBUsingIndex} (see invokeSubquerySelectSchemaProc(..) method) + * Each of the processors are invoked according to a rule and serve to append subquery to original operator tree. + * + * This subquery scans over the index table rather than the original table. 
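+ * As a purely illustrative sketch (tbl and tbl_idx are hypothetical names, not taken from this patch),
+ * a query like SELECT key, count(key) FROM tbl GROUP BY key is roughly rewritten to
+ * SELECT key, sum(cnt) FROM (SELECT key, `_countkey` as cnt FROM tbl_idx) v1 GROUP BY key.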
+ * IT replaces the count(literal)/count(index_key) function in the original select operator + * with sum(cnt) where cnt is _countkey from subquery select operator. + * + * This change necessitates change in the rowSchema, colList, colExprMap, rowResolver of all the SelectOperator's in original + * operator tree. It also requires to set appropriate predicate parameters and group-by aggregation parameters in original + * operator tree. Each of the processors in this Factory take care of these changes. + * + */ +public final class RewriteIndexSubqueryProcFactory { + protected final static Log LOG = LogFactory.getLog(RewriteIndexSubqueryProcFactory.class.getName()); + private static RewriteIndexSubqueryCtx subqueryCtx = null; + + private RewriteIndexSubqueryProcFactory() { + //this prevents the class from getting instantiated + } + + /** + * This processor retrieves the rowSchema, rowResolver, colList, colExprMap and outputColumnNames data structures + * from the SelectOperator and its descriptor(SelectDesc). It stores the information in the RewriteIndexSubqueryCtx instance + * for later use in correcting the schema of original operator tree. + * + */ + private static class SubquerySelectSchemaProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + SelectOperator operator = (SelectOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + + //We need to clear this every time in cases where there are multiple operator tree paths with multiple SelectOperators + subqueryCtx.getNewOutputCols().clear(); + subqueryCtx.getNewColExprMap().clear(); + subqueryCtx.getNewColList().clear(); + subqueryCtx.getNewRS().clear(); + subqueryCtx.setNewRR(new RowResolver()); + + + RowResolver oldRR = subqueryCtx.getSubqueryPctx().getOpParseCtx().get(operator).getRowResolver(); + SelectDesc oldConf = (SelectDesc) operator.getConf(); + Map oldColumnExprMap = operator.getColumnExprMap(); + ArrayList oldColList = oldConf.getColList(); + + //We create the mapping of column name alias to internal name for later use in correcting original operator tree + ArrayList schemaSign = operator.getSchema().getSignature(); + for (ColumnInfo columnInfo : schemaSign) { + String internal = columnInfo.getInternalName(); + String alias = columnInfo.getAlias(); + subqueryCtx.getAliasToInternal().put(alias, internal); + } + + /**outputColumnNames**/ + String internalName = null; + for(int i=0; i < oldConf.getOutputColumnNames().size(); i++){ + internalName = oldConf.getOutputColumnNames().get(i); + //Populate all output columns (required by SelectOperators in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewOutputCols().add(new String(internalName)); + + /**colExprMap**/ + if(oldColumnExprMap != null){ + ExprNodeDesc expr = oldColumnExprMap.get(internalName); //in case of simple column names + if(expr instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc oldColExpr = (ExprNodeColumnDesc)expr ; + ExprNodeColumnDesc newColExpr = (ExprNodeColumnDesc) oldColExpr.clone(); + newColExpr.setColumn(internalName); + //Populate columnExprMap (required by SelectOperator and FilterOperator in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewColExprMap().put(internalName, newColExpr); + }else if(expr instanceof ExprNodeGenericFuncDesc){ //in case of functions on columns + ExprNodeGenericFuncDesc oldFuncExpr = (ExprNodeGenericFuncDesc)expr ; + ExprNodeGenericFuncDesc newFuncExpr = (ExprNodeGenericFuncDesc) oldFuncExpr.clone(); + 
List childExprList = newFuncExpr.getChildExprs(); + List newChildExprList = new ArrayList(); + for (ExprNodeDesc childExpr : childExprList) { //we have the list of columns here + if(childExpr instanceof ExprNodeColumnDesc){ + ((ExprNodeColumnDesc) childExpr).setColumn(internalName); + newChildExprList.add(childExpr); + } + newFuncExpr.setChildExprs(newChildExprList); + //Populate columnExprMap (required by SelectOperator and FilterOperator in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewColExprMap().put(internalName, newFuncExpr); + } + } + } + + /**colList**/ + if(oldColList != null){ + ExprNodeDesc expr = oldColList.get(i); + if(expr instanceof ExprNodeColumnDesc){//in case of simple column names + ExprNodeColumnDesc newColExpr = (ExprNodeColumnDesc) expr.clone(); + newColExpr.setColumn(internalName); + //Populate colList (required by SelectOperators in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewColList().add(newColExpr); + }else if(expr instanceof ExprNodeGenericFuncDesc){//in case of functions on columns + ExprNodeGenericFuncDesc oldFuncExpr = (ExprNodeGenericFuncDesc)expr ; + ExprNodeGenericFuncDesc newFuncExpr = (ExprNodeGenericFuncDesc) oldFuncExpr.clone(); + List childExprList = newFuncExpr.getChildExprs(); + List newChildExprList = new ArrayList(); + for (ExprNodeDesc childExpr : childExprList) {//we have the list of columns here + if(childExpr instanceof ExprNodeColumnDesc){ + ((ExprNodeColumnDesc) childExpr).setColumn(internalName); + newChildExprList.add(childExpr); + } + newFuncExpr.setChildExprs(newChildExprList); + //Populate colList (required by SelectOperators in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewColList().add(newFuncExpr); + } + } + } + } + + /**RowSchema and RowResolver**/ + for (int i = 0; i < subqueryCtx.getNewOutputCols().size(); i++) { + internalName = subqueryCtx.getNewOutputCols().get(i); + String[] nm = oldRR.reverseLookup(internalName); + ColumnInfo col; + try { + //We need to set the alias for the new index table subquery + col = oldRR.get(nm[0], nm[1]); + if(nm[0] == null){ + nm[0] = "v" + i; //add different alias in case original query has multiple subqueries + } + // Populate RowResolver and RowSchema (required by SelectOperator and FilterOperator in original DAG) in RewriteIndexSubqueryCtx + subqueryCtx.getNewRR().put(nm[0], nm[1], col); + subqueryCtx.getNewRS().add(col); + } catch (SemanticException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + //We need this SelectOperator from subquery as a reference point to append in original query + subqueryCtx.setSubqSelectOp(operator); + + return null; + } + } + + public static SubquerySelectSchemaProc getSubquerySelectSchemaProc(){ + return new SubquerySelectSchemaProc(); + } + + + /** + * We do not need the fileSinkOperator of the subquery operator tree when we append the rest of the subquery operator tree + * to the original operator tree. This processor gets rid of this FS operator by removing it from subquery OpParseContext. + * + */ + private static class SubqueryFileSinkProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + FileSinkOperator operator = (FileSinkOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + //Store the list of FileSinkOperator's parent operators as we later append the original query + //at the end of the subquery operator tree (without the FileSinkOperator). 
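+ //For a simple generated subquery the plan is roughly TS-SEL-FS (an assumption for illustration,
+ //not asserted by this patch), so this list typically ends up holding the subquery SelectOperator,
+ //which is the point where the rest of the original operator tree is attached later.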
+ subqueryCtx.getSubqFSParentList().addAll(operator.getParentOperators()); + subqueryCtx.getSubqueryPctx().getOpParseCtx().remove(operator); + return null; + } + } + + public static SubqueryFileSinkProc getSubqueryFileSinkProc(){ + return new SubqueryFileSinkProc(); + } + + /** + * This processor appends the subquery operator tree to the original operator tree. + * Since genPlan(..) method from the SemanticAnalyzer creates the operator tree bottom-up i.e. + * FROM-WHERE-GROUPBY-ORDERBY-SELECT etc, any query with nested subqueries will have the TableScanOperator of the + * innermost subquery as the top operator in the topOps and topToTable maps. + * + * Any subquery which is a part of the from clause + * (eg: SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2;) always has its + * DAG operator tree appended before the operator tree of the enclosing query. + * For example, for the above query, the operator tree is: + * SEL(1)[subq]--->GBY(2)[subq]--->RS(3)[subq]--->GBY(4)[subq]--->SEL(5)[subq]--->FIL(6)[orig]--->SEL(7)[orig]--->FS(8)[orig]> + * + * We replace the TableScanOperator (TS) of the original operator tree with the whole subquery operator tree (without the + * FileSinkOperator of the subquery operator tree). + * + */ + private static class AppendSubqueryToOriginalQueryProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + TableScanOperator operator = (TableScanOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + List> origChildrenList = operator.getChildOperators(); + + /* origChildrenList has the child operators for the TableScanOperator of the original DAG + * We need to get rid of the TS operator of original DAG and append rest of the tree to the sub-query operator DAG + * This code sets the parentOperators of first operator in origChildrenList to subqFSParentList. + * subqFSParentList contains the parentOperators list of the FileSinkOperator of the sub-query operator DAG + * + * subqLastOp is the last SelectOperator of sub-query DAG. 
The rest of the original operator DAG needs to be appended here + * Hence, set the subqLastOp's child operators to be origChildrenList + * + * */ + if(origChildrenList != null && origChildrenList.size() > 0){ + origChildrenList.get(0).setParentOperators(subqueryCtx.getSubqFSParentList()); + } + if(subqueryCtx.getSubqSelectOp() != null){ + subqueryCtx.getSubqSelectOp().setChildOperators(origChildrenList); + } + + /* The operator DAG plan is generated in the order FROM-WHERE-GROUPBY-ORDERBY-SELECT + * We have appended the original operator DAG at the end of the sub-query operator DAG + * as the sub-query will always be a part of FROM processing + * Now we need to insert the final sub-query+original DAG to the original ParseContext + */ + + HashMap> subqTopMap = subqueryCtx.getSubqueryPctx().getTopOps(); + Iterator subqTabItr = subqTopMap.keySet().iterator(); + String subqTab = subqTabItr.next(); + Operator subqOp = subqTopMap.get(subqTab); + Table tbl = subqueryCtx.getSubqueryPctx().getTopToTable().get(subqOp); + + //remove original TableScanOperator from the topToTable map + //Put the new TableScanOperator (top operator of the subquery operator tree) to topToTable map + subqueryCtx.getParseContext().getTopToTable().remove(operator); + subqueryCtx.getParseContext().getTopToTable().put((TableScanOperator) subqOp, tbl); + + String tabAlias = ""; + if(subqueryCtx.getBaseTableName().contains(":")){ + String[] tabToAlias = subqueryCtx.getBaseTableName().split(":"); + if(tabToAlias.length > 1){ + tabAlias = tabToAlias[0] + ":"; + } + } + //remove original table and operator tree mapping from topOps + //put the new table alias adn subquery index table as the key and the new operator tree as value in topOps + subqueryCtx.getParseContext().getTopOps().remove(subqueryCtx.getBaseTableName()); + subqueryCtx.getParseContext().getTopOps().put(tabAlias + subqTab, subqOp); + + //we need this later + subqueryCtx.setNewTSOp(subqOp); + + //remove original TableScanOperator from the original OpParsecontext + //add all values from the subquery OpParseContext to the original OpParseContext + subqueryCtx.getParseContext().getOpParseCtx().remove(operator); + subqueryCtx.getParseContext().getOpParseCtx().putAll(subqueryCtx.getSubqueryPctx().getOpParseCtx()); + LOG.info("Finished appending subquery"); + return null; + } + } + + public static AppendSubqueryToOriginalQueryProc getAppendSubqueryToOriginalQueryProc(){ + return new AppendSubqueryToOriginalQueryProc(); + } + + + + /** + * NewQuerySelectSchemaProc. + * + */ + private static class NewQuerySelectSchemaProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... 
nodeOutputs) throws SemanticException { + SelectOperator operator = (SelectOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + + List> parentOps = operator.getParentOperators(); + Operator parentOp = parentOps.iterator().next(); + List> childOps = operator.getChildOperators(); + Operator childOp = childOps.iterator().next(); + + + if(parentOp instanceof TableScanOperator){ + //We need to copy the colExprMap of this SelectOperator whose parent is TableScanOperator to the + //colExprMap of the SelectOperator whose child operator is a GroupByOperator + subqueryCtx.setNewSelColExprMap(operator.getColumnExprMap()); + }else if((!(parentOp instanceof TableScanOperator)) //skip first SelectOperator in operator tree + && (!(childOp instanceof FileSinkOperator)) //skip last SelectOperator in operator tree + && (!(childOp instanceof ReduceSinkOperator))){ //skip the SelectOperator which appears before a JOIN in operator tree + + //Copy colList and outputColumns for SelectOperator from sub-query DAG SelectOperator + //these are all the SelectOperators that come in between the first SelectOperator and last SelectOperator in the operator tree + operator.setColumnExprMap(subqueryCtx.getNewColExprMap()); + subqueryCtx.getParseContext().getOpParseCtx().get(operator).setRowResolver(subqueryCtx.getNewRR()); + operator.getSchema().setSignature(subqueryCtx.getNewRS()); + SelectDesc conf = (SelectDesc) operator.getConf(); + conf.setColList(subqueryCtx.getNewColList()); + conf.setOutputColumnNames(subqueryCtx.getNewOutputCols()); + } + + if (childOp instanceof GroupByOperator){ + //use the original columnExprMap to construct the newColList + subqueryCtx.getNewSelColList().clear(); + /**colList**/ + Set internalNamesList = operator.getColumnExprMap().keySet(); + for (String internal : internalNamesList) { + ExprNodeDesc expr = operator.getColumnExprMap().get(internal).clone(); + if(expr instanceof ExprNodeGenericFuncDesc){ + List colExprs = ((ExprNodeGenericFuncDesc)expr).getChildExprs(); + for (ExprNodeDesc colExpr : colExprs) { + if(colExpr instanceof ExprNodeColumnDesc){ + if(!subqueryCtx.getNewSelColList().contains(colExpr)){ + TypeInfo typeInfo = colExpr.getTypeInfo(); + if(typeInfo instanceof ListTypeInfo){ + PrimitiveTypeInfo pti = new PrimitiveTypeInfo(); + pti.setTypeName("int"); + colExpr.setTypeInfo(pti); + } + subqueryCtx.getNewSelColList().add(colExpr); + } + } + } + + }else if(expr instanceof ExprNodeColumnDesc){ + if(!subqueryCtx.getNewSelColList().contains(expr)){ + subqueryCtx.getNewSelColList().add(expr); + } + } + } + //Set the new colExprMap and new colList + operator.setColumnExprMap(subqueryCtx.getNewSelColExprMap()); + SelectDesc selDesc = (SelectDesc) operator.getConf(); + selDesc.setColList(subqueryCtx.getNewSelColList()); + } + + return null; + } + } + + public static NewQuerySelectSchemaProc getNewQuerySelectSchemaProc(){ + return new NewQuerySelectSchemaProc(); + } + + + /** + * We need to replace the count(literal) GenericUDAF aggregation function for group-by construct to "sum" GenericUDAF. + * This processor creates a new operator tree for a sample query that creates a GroupByOperator with sum aggregation function + * and uses that GroupByOperator information to replace the original GroupByOperator aggregation information. + * It replaces the AggregationDesc (aggregation descriptor) of the old GroupByOperator with the new Aggregation Desc + * of the new GroupByOperator. 
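+ * For instance, if the rewrite selects the column 'key' from base table 'tbl' (hypothetical names used
+ * only for illustration), the generated sample query is of the form
+ * select sum(key) as TOTAL from tbl group by key, and the sum AggregationDesc of its GroupByOperator
+ * replaces the original count AggregationDesc.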
+ * + * The processor also corrects the RowSchema and group-by keys by replacing the existing internal names with the new internal names. + * This change is required as we add a new subquery to the original query which triggers this change. + * + */ + private static class NewQueryGroupbySchemaProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + GroupByOperator operator = (GroupByOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + + //We need to replace the GroupByOperator which is in groupOpToInputTables map with the new GroupByOperator + if(subqueryCtx.getParseContext().getGroupOpToInputTables().containsKey(operator)){ + //we need to get rif of the alias and construct a query only with the base table name + String table = subqueryCtx.getBaseTableName(); + if(table.contains(":")){ + String[] aliasAndTab = table.split(":"); + table = aliasAndTab[1]; + } + String selReplacementCommand = ""; + if(subqueryCtx.getSelectColumnNames().iterator().hasNext()){ + //the query contains the sum aggregation GenericUDAF + selReplacementCommand = "select sum(" + subqueryCtx.getSelectColumnNames().iterator().next() + ") as TOTAL from " + table + + " group by " + subqueryCtx.getSelectColumnNames().iterator().next() + " "; + } + //create a new ParseContext for the query to retrieve its operator tree, and the required GroupByOperator from it + ParseContext newDAGContext = RewriteParseContextGenerator.generateOperatorTree(subqueryCtx.getParseContext().getConf(), + selReplacementCommand); + subqueryCtx.setNewDAGCtx(newDAGContext); + + //we get our new GroupByOperator here + Map> newGbyOpMap = subqueryCtx.getNewDAGCtx().getGroupOpToInputTables(); + GroupByOperator newGbyOperator = newGbyOpMap.keySet().iterator().next(); + + //remove the old GroupByOperator + GroupByDesc oldConf = operator.getConf(); + ArrayList oldAggrList = oldConf.getAggregators(); + if(oldAggrList != null && oldAggrList.size() > 0){ + for (AggregationDesc aggregationDesc : oldAggrList) { + if(aggregationDesc != null && aggregationDesc.getGenericUDAFName().equals("count")){ + oldAggrList.remove(aggregationDesc); + break; + } + + } + } + + //Construct the new AggregationDesc to get rid of the current internal names and replace them with new internal names + //as required by the operator tree + GroupByDesc newConf = newGbyOperator.getConf(); + ArrayList newAggrList = newConf.getAggregators(); + if(newAggrList != null && newAggrList.size() > 0){ + for (AggregationDesc aggregationDesc : newAggrList) { + subqueryCtx.setEval(aggregationDesc.getGenericUDAFEvaluator()); + ArrayList paraList = aggregationDesc.getParameters(); + for (int i=0; i< paraList.size(); i++) { + ExprNodeDesc expr = paraList.get(i); + if(expr instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc colExpr = (ExprNodeColumnDesc)expr; + String col = "cnt"; + if(subqueryCtx.getAliasToInternal().containsKey(col)){ + colExpr.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + colExpr.setTabAlias(null); + expr = colExpr; + } + paraList.set(i, expr); + } + oldAggrList.add(aggregationDesc); + } + } + + //Construct the new colExprMap to get rid of the current internal names and replace them with new internal names + //as required by the operator tree + Map newGbyColExprMap = new LinkedHashMap(); + Map oldGbyColExprMap = operator.getColumnExprMap(); + Set internalNameSet = oldGbyColExprMap.keySet(); + for (String internal : internalNameSet) { + ExprNodeDesc expr = 
oldGbyColExprMap.get(internal).clone(); + if(expr instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc colExpr = (ExprNodeColumnDesc)expr; + String col = colExpr.getColumn(); + if(subqueryCtx.getSelectColumnNames().contains(col)){ + colExpr.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + }else if(expr instanceof ExprNodeGenericFuncDesc){ + List childExprList = ((ExprNodeGenericFuncDesc)expr).getChildExprs(); + for (ExprNodeDesc childExpr : childExprList) { + if(childExpr instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc colExpr = (ExprNodeColumnDesc)childExpr; + String col = colExpr.getColumn(); + if(subqueryCtx.getSelectColumnNames().contains(col)){ + colExpr.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + } + } + + } + newGbyColExprMap.put(internal, expr); + } + + //Construct the new group-by keys to get rid of the current internal names and replace them with new internal names + //as required by the operator tree + ArrayList newGbyKeys = new ArrayList(); + ArrayList oldGbyKeys = oldConf.getKeys(); + for (int i =0; i< oldGbyKeys.size(); i++) { + ExprNodeDesc expr = oldGbyKeys.get(i).clone(); + if(expr instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc colExpr = (ExprNodeColumnDesc)expr; + String col = colExpr.getColumn(); + if(subqueryCtx.getSelectColumnNames().contains(col)){ + colExpr.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + expr = colExpr; + }else if(expr instanceof ExprNodeGenericFuncDesc){ + ExprNodeGenericFuncDesc funcExpr = (ExprNodeGenericFuncDesc)expr; + List childExprList = funcExpr.getChildExprs(); + for (ExprNodeDesc childExpr : childExprList) { + if(childExpr instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc colExpr = (ExprNodeColumnDesc)childExpr; + String col = colExpr.getColumn(); + if(subqueryCtx.getSelectColumnNames().contains(col)){ + colExpr.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + + } + } + } + newGbyKeys.add(expr); + } + + //Construct the new RowSchema. We do not need a alias for the new internalNames + RowSchema oldRS = operator.getSchema(); + ArrayList oldSign = oldRS.getSignature(); + ArrayList newSign = new ArrayList(); + for (ColumnInfo columnInfo : oldSign) { + columnInfo.setAlias(null); + newSign.add(columnInfo); + } + + //reset the above data structures in the original GroupByOperator + oldRS.setSignature(newSign); + operator.setSchema(oldRS); + oldConf.setKeys(newGbyKeys); + oldConf.setAggregators(oldAggrList); + operator.setColumnExprMap(newGbyColExprMap); + operator.setConf(oldConf); + + }else{ + //we just need to reset the GenericUDAFEvaluator and its name for this GroupByOperator whose parent is the + //ReduceSinkOperator + GroupByDesc childConf = (GroupByDesc) operator.getConf(); + ArrayList childAggrList = childConf.getAggregators(); + if(childAggrList != null && childAggrList.size() > 0){ + for (AggregationDesc aggregationDesc : childAggrList) { + aggregationDesc.setGenericUDAFEvaluator(subqueryCtx.getEval()); + aggregationDesc.setGenericUDAFName("sum"); + } + } + + } + + return null; + } + } + + public static NewQueryGroupbySchemaProc getNewQueryGroupbySchemaProc(){ + return new NewQueryGroupbySchemaProc(); + } + + + /** + * This processor corrects the RowResolver for the FilterOperator of the original operator tree using + * the RowResolver obtained from the subquery SelectOperator in SubquerySelectSchemaProc processor. + * It also needs to replace the current internal names with new internal names for all instances of the + * ExprNodeColumnDesc. 
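+ * For example (illustrative only), a predicate such as key > 100 keeps its shape, but the
+ * ExprNodeColumnDesc for 'key' is re-pointed at the internal name the subquery exposes for that
+ * column (for instance _col0; the concrete names depend on the generated subquery).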
It recursively calls the setFilterPredicateCol(..) method to set this information correctly. + * + */ + private static class NewQueryFilterSchemaProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + FilterOperator operator = (FilterOperator)nd; + subqueryCtx = (RewriteIndexSubqueryCtx)ctx; + //Set new RowResolver + operator.getSchema().setSignature(subqueryCtx.getNewRS()); + subqueryCtx.getParseContext().getOpParseCtx().get(operator).setRowResolver(subqueryCtx.getNewRR()); + + //Set correct internalNames + FilterDesc conf = operator.getConf(); + ExprNodeDesc expr = conf.getPredicate(); + setFilterPredicateCol(expr); + conf.setPredicate(expr); + return null; + } + } + + + /** + * This method is recursively called whenever we have our expression node descriptor to be an instance of the ExprNodeGenericFuncDesc. + * We exit the recursion when we find an instance of ExprNodeColumnDesc and set its column name to internal name + * @param expr + */ + private static void setFilterPredicateCol(ExprNodeDesc expr){ + if(expr instanceof ExprNodeColumnDesc){ + ExprNodeColumnDesc colExpr = (ExprNodeColumnDesc)expr; + String col = colExpr.getColumn(); + if(subqueryCtx.getSelectColumnNames().contains(col)){ + colExpr.setColumn(subqueryCtx.getAliasToInternal().get(col)); + } + expr = colExpr; + }else if(expr instanceof ExprNodeGenericFuncDesc){ + ExprNodeGenericFuncDesc funcExpr = (ExprNodeGenericFuncDesc)expr; + List childExprList = funcExpr.getChildExprs(); + for (ExprNodeDesc childExpr : childExprList) { + //continue until you find an instance of the ExprNodeColumnDesc + setFilterPredicateCol(childExpr); + } + } + + } + + + public static NewQueryFilterSchemaProc getNewQueryFilterSchemaProc(){ + return new NewQueryFilterSchemaProc(); + } + + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteRemoveGroupbyCtx.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteRemoveGroupbyCtx.java new file mode 100644 index 0000000..084a43e --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteRemoveGroupbyCtx.java @@ -0,0 +1,237 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.optimizer.index; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Stack; + +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.PreOrderWalker; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.HiveParser; +import org.apache.hadoop.hive.ql.parse.OpParseContext; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; + +/** + * RewriteRemoveGroupbyCtx class stores the context for the {@link RewriteRemoveGroupbyProcFactory} processor factory methods + */ +public class RewriteRemoveGroupbyCtx implements NodeProcessorCtx { + + private RewriteRemoveGroupbyCtx(ParseContext parseContext, Hive hiveDb, String indexTableName){ + //this prevents the class from getting instantiated + this.parseContext = parseContext; + this.hiveDb = hiveDb; + this.indexName = indexTableName; + this.opc = parseContext.getOpParseCtx(); + } + + public static RewriteRemoveGroupbyCtx getInstance(ParseContext parseContext, Hive hiveDb, String indexTableName){ + return new RewriteRemoveGroupbyCtx(parseContext, hiveDb, indexTableName); + } + + //We need these two ArrayLists to reset the parent operator list and child operator list in the operator tree + // once we remove the operators that represent the group-by construct + private final List> newParentList = new ArrayList>(); + private final List> newChildrenList = new ArrayList>(); + + //We need to remove the operators from OpParseContext to remove them from the operator tree + private LinkedHashMap, OpParseContext> opc = new LinkedHashMap, OpParseContext>(); + private final Hive hiveDb; + private final ParseContext parseContext; + + private final String indexName; + + public List> getNewParentList() { + return newParentList; + } + + public List> getNewChildrenList() { + return newChildrenList; + } + + public LinkedHashMap, OpParseContext> getOpc() { + return opc; + } + + public ParseContext getParseContext() { + return parseContext; + } + + public Hive getHiveDb() { + return hiveDb; + } + + public String getIndexName() { + return indexName; + } + + /** + * Given a root node of the parse tree, this function returns the "first" TOK_FUNCTION node + * that matches the input function name + * + * @param root + * @return + */ + ASTNode getFuncNode(ASTNode root, String funcName){ + ASTNode func = null; + ArrayList cList = root.getChildren(); + while(cList != null && cList.size() > 0){ + for (Node node : cList) { + if(null != node){ + ASTNode curr = (ASTNode)node; + if(curr.getType() == HiveParser.TOK_TABLE_OR_COL){ + ArrayList funcChildren = curr.getChildren(); + for (Node child : funcChildren) { + ASTNode funcChild = (ASTNode)child; + if(funcChild.getText().equals(funcName)){ + func = curr; + cList = null; + break; + } + } + }else{ + cList = curr.getChildren(); + continue; + } + } + } + } + 
return func; + } + + + /** + * Given an input operator, this function returns the top TableScanOperator for the operator tree + * @param inputOp + * @return + */ + Operator getTopOperator(Operator inputOp){ + Operator tsOp = null; + List> parentList = inputOp.getParentOperators(); + while(parentList != null && parentList.size() > 0){ + for (Operator op : parentList) { + if(op != null){ + if(op instanceof TableScanOperator){ + tsOp = (TableScanOperator) op; + parentList = null; + break; + }else{ + parentList = op.getParentOperators(); + continue; + } + } + } + } + + return tsOp; + } + + + /** + * Walk the original operator tree using the {@link PreOrderWalker} using the rules. + * Each of the rules invoke respective methods from the {@link RewriteRemoveGroupbyProcFactory} + * to remove the group-by constructs from the original query and replace the original + * {@link TableScanOperator} with the new index table scan operator. + * + * @param topOp + * @throws SemanticException + */ + public void invokeRemoveGbyProc(Operator topOp) throws SemanticException{ + Map opRules = new LinkedHashMap(); + + // replace scan operator containing original table with index table + opRules.put(new RuleRegExp("R1", "TS%"), RewriteRemoveGroupbyProcFactory.getReplaceTableScanProc()); + //rule that replaces index key selection with size(_offsets) function in original query + opRules.put(new RuleRegExp("R2", "SEL%"), RewriteRemoveGroupbyProcFactory.getReplaceIdxKeyWithSizeFuncProc()); + // remove group-by pattern from original operator tree + opRules.put(new RuleRegExp("R3", "GBY%RS%GBY%"), RewriteRemoveGroupbyProcFactory.getRemoveGroupByProc()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this); + GraphWalker ogw = new PreOrderWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.add(topOp); + ogw.startWalking(topNodes, null); + + } + + + /** + * Walk the original operator tree using the {@link PreOrderWalker} using the rules. + * Each of the rules invoke respective methods from the {@link RewriteRemoveGroupbyProcFactory} + * to replace the original {@link TableScanOperator} with the new index table scan operator. + * + * @param topOp + * @throws SemanticException + */ + public void invokeReplaceTableScanProc(Operator topOp) throws SemanticException{ + Map opRules = new LinkedHashMap(); + + // replace scan operator containing original table with index table + opRules.put(new RuleRegExp("R1", "TS%"), RewriteRemoveGroupbyProcFactory.getReplaceTableScanProc()); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, this); + GraphWalker ogw = new PreOrderWalker(disp); + + // Create a list of topop nodes + ArrayList topNodes = new ArrayList(); + topNodes.add(topOp); + ogw.startWalking(topNodes, null); + + } + + /** + * Default procedure for {@link DefaultRuleDispatcher} + * @return + */ + private NodeProcessor getDefaultProc() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack stack, + NodeProcessorCtx procCtx, Object... 
nodeOutputs) throws SemanticException { + return null; + } + }; + } + + + + +} + diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteRemoveGroupbyProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteRemoveGroupbyProcFactory.java new file mode 100644 index 0000000..cff139b --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/index/RewriteRemoveGroupbyProcFactory.java @@ -0,0 +1,343 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer.index; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.Context; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.parse.ASTNode; +import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.OpParseContext; +import org.apache.hadoop.hive.ql.parse.ParseDriver; +import org.apache.hadoop.hive.ql.parse.ParseException; +import org.apache.hadoop.hive.ql.parse.ParseUtils; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer; +import org.apache.hadoop.hive.ql.parse.SemanticAnalyzerFactory; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.tableSpec; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; + +/** + * Factory of processors used by {@link RewriteGBUsingIndex} (see 
invokeRemoveGbyProc(..) method) + * Each of the processors are invoked according to a rule and serve towards removing + * group-by construct from original operator tree + * + */ +public final class RewriteRemoveGroupbyProcFactory { + protected final static Log LOG = LogFactory.getLog(RewriteRemoveGroupbyProcFactory.class.getName()); + private static RewriteRemoveGroupbyCtx removeGbyCtx = null; + + private RewriteRemoveGroupbyProcFactory() { + //this prevents the class from getting instantiated + } + + /** + * This processor removes the SelectOperator whose child is a GroupByOperator from the operator tree (OpParseContext). + * When we remove the group-by construct from the query, we do not need this SelectOperator which worked initially as an + * interim operator to pass arguments from the parent TableScanOperator to the child GroupByOperator (Remember that the genPlan(..) + * method creates the operators bottom-up FROM-WHERE-GROUPBY-ORDER-BY-SELECT etc) + * + * Since we need to remove the group-by construct (comprising of GBY-RS-GBY operators and interim SEL operator), the processor sets the + * appropriate parent-child links. + * + * The processor also constructs a ExprNodeDesc instance for the _countkey and replaces the index key columns + * with this function descriptor. It also sets the rowSchema, colList and colExprMap data structures correctly for this SelectOperator + * to accommodate the new replacement and removal of group-by construct + * + */ + private static class ReplaceIdxKeyWithSizeFunc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + SelectOperator operator = (SelectOperator)nd; + removeGbyCtx = (RewriteRemoveGroupbyCtx)ctx; + + //as of now, we have hard-coded the positions as get(0) etc as whenever a group-by construct appears in teh operator tree, + //it comes in the SEL-GBY-RS-SEL combination. 
This lets us presume that the parent or child operator will always be + // at the 0th position in the DAG operator tree + List> childrenList = operator.getChildOperators(); + Operator child = childrenList.get(0); + Operator parent = operator.getParentOperators().get(0); + + if(child instanceof GroupByOperator){ + //this is the interim SEL operator for the group-by construct, we do not need this in the re-written operator tree + removeGbyCtx.getNewParentList().addAll(operator.getParentOperators()); + removeGbyCtx.getOpc().remove(operator); + }else if(parent instanceof GroupByOperator){ + + // set the child operator list of interim SEL's parent operator to be the child operator list of the GroupByOperator + removeGbyCtx.getNewParentList().get(0).setChildOperators(removeGbyCtx.getNewChildrenList()); + // set the parent operator list for the SelectOperator (whose parent operator is GroupByOperator) + //to be the parent list of interim SEL operator + removeGbyCtx.getNewChildrenList().get(0).setParentOperators(removeGbyCtx.getNewParentList()); + + //This code parses the string command and constructs a ASTNode parse tree + //we need this to construct the ExprNodeDesc for the _countkey column + HiveConf conf = removeGbyCtx.getParseContext().getConf(); + Context context = null; + ASTNode tree = null; + BaseSemanticAnalyzer sem = null; + String newSelCommand = "select `_countkey` from " + removeGbyCtx.getIndexName(); + try { + context = new Context(conf); + ParseDriver pd = new ParseDriver(); + tree = pd.parse(newSelCommand, context); + tree = ParseUtils.findRootNonNullToken(tree); + sem = SemanticAnalyzerFactory.get(conf, tree); + + } catch (ParseException e) { + LOG.info("ParseException in ReplaceIdxKeyWithSizeFunc"); + e.printStackTrace(); + } catch (SemanticException e) { + LOG.info("SemanticException in ReplaceIdxKeyWithSizeFunc"); + e.printStackTrace(); + } catch (IOException e) { + LOG.info("IOException in ReplaceIdxKeyWithSizeFunc"); + e.printStackTrace(); + } + + //We retrieve the ASTNode function token from the root tree + ASTNode funcNode = removeGbyCtx.getFuncNode(tree, "`_countkey`"); + + //We need the rowResolver of the parent TableScanOperator to fix the rowSchema, colList, colExprMap of the SelectOperator + //and also to construct the ExprNodeDesc to replace the index key columns with _countkey + LinkedHashMap, OpParseContext> opCtxMap = + removeGbyCtx.getParseContext().getOpParseCtx(); + Operator tsOp = removeGbyCtx.getTopOperator(operator); + OpParseContext tsCtx = opCtxMap.get(tsOp); + ExprNodeDesc expr1 = ((SemanticAnalyzer) sem).genExprNodeDesc(funcNode, tsCtx.getRowResolver()); + String countCol = ""; + + if(expr1 instanceof ExprNodeColumnDesc){ + countCol = ((ExprNodeColumnDesc) expr1).getColumn(); + } + + SelectDesc selDesc = (SelectDesc) operator.getConf(); + //Since we have removed the interim SEL operator when we removed the group-by construct, we need to get rid + //of the internal names in the colList and colExprMap of this SelectOperator + //internalToAlias map gives us this mapping to correct these data structures + HashMap internalToAlias = new LinkedHashMap(); + + //Set the new RowSchema and populate the internalToAlias map + RowSchema rs = operator.getSchema(); + ArrayList newRS = new ArrayList(); + ArrayList sign = rs.getSignature(); + for (ColumnInfo columnInfo : sign) { + String alias = columnInfo.getAlias(); + String internalName = columnInfo.getInternalName(); + internalToAlias.put(internalName, alias); + //the function name always has alias starting with _c 
(for eg. _c1 etc) + //We need to set the new alias (_offsets) for the initial "_c1" in rowSchema + if(alias != null && alias.startsWith("_c")){ + columnInfo.setAlias(countCol); + } + newRS.add(columnInfo); + } + operator.getSchema().setSignature(newRS); + + //Set the colList of this SelectOperator + ArrayList colList = selDesc.getColList(); + int i = 0; + for (; i< colList.size(); i++) { + ExprNodeDesc expr2 = colList.get(i); + if(expr2 instanceof ExprNodeColumnDesc){ + String internal = ((ExprNodeColumnDesc)expr2).getColumn(); + //get rid of the internal column names like _col0, _col1 and replace them with their actual names i.e. alias + if(internalToAlias.get(internal) != null){ + ((ExprNodeColumnDesc) expr2).setColumn(internalToAlias.get(internal)); + } + //however, if the alias itself is the internal name of the function argument, say _c1, we need to replace the + //ExprNodeColumnDesc instance with the ExprNodeGenericFuncDesc (i.e. exprNode here) + //this replaces the count(literal) or count(index_key) function with _countkey + if(((ExprNodeColumnDesc) expr2).getColumn().startsWith("_c")){ + colList.set(i, expr1); + } + } + } + + selDesc.setColList(colList); + + //Set the new colExprMap for this SelectOperator + Map origColExprMap = operator.getColumnExprMap(); + Map newColExprMap = new LinkedHashMap(); + Set internalNamesList = origColExprMap.keySet(); + for (String internal : internalNamesList) { + ExprNodeDesc internalExpr = origColExprMap.get(internal).clone(); + if(internalExpr instanceof ExprNodeColumnDesc){ + //get rid of the internal column names like _col0, _col1 and replace them with their actual names i.e. alias + if(internalToAlias.get(internal) != null){ + ((ExprNodeColumnDesc) internalExpr).setColumn(internalToAlias.get(internal)); + } + //this replaces the count(literal) or count(index_key) function with _countkey + if(((ExprNodeColumnDesc) internalExpr).getColumn().startsWith("_c")){ + newColExprMap.put(internal, expr1); + }else{ + newColExprMap.put(internal, internalExpr); + } + }else{ + newColExprMap.put(internal, internalExpr); + } + } + operator.setColumnExprMap(newColExprMap); + } + return null; + } + } + + public static ReplaceIdxKeyWithSizeFunc getReplaceIdxKeyWithSizeFuncProc(){ + return new ReplaceIdxKeyWithSizeFunc(); + } + + + /** + * This processor replaces the original TableScanOperator with the new TableScanOperator and metadata that scans over the + * index table rather than scanning over the orginal table. + * + */ + private static class RepaceTableScanOpProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... 
nodeOutputs) throws SemanticException { + TableScanOperator scanOperator = (TableScanOperator)nd; + removeGbyCtx = (RewriteRemoveGroupbyCtx)ctx; + + HashMap topToTable = + removeGbyCtx.getParseContext().getTopToTable(); + + //construct a new descriptor for the index table scan + TableScanDesc indexTableScanDesc = new TableScanDesc(); + indexTableScanDesc.setGatherStats(false); + + //String tableName = removeGbyCtx.getCanApplyCtx().findBaseTable(baseTableName); + String tableName = removeGbyCtx.getIndexName(); + + tableSpec ts = new tableSpec(removeGbyCtx.getHiveDb(), + removeGbyCtx.getParseContext().getConf(), + tableName + ); + String k = tableName + Path.SEPARATOR; + indexTableScanDesc.setStatsAggPrefix(k); + scanOperator.setConf(indexTableScanDesc); + + //remove original TableScanOperator + topToTable.clear(); + removeGbyCtx.getParseContext().getTopOps().clear(); + + //Scan operator now points to other table + scanOperator.setAlias(tableName); + topToTable.put(scanOperator, ts.tableHandle); + removeGbyCtx.getParseContext().setTopToTable(topToTable); + + OpParseContext operatorContext = + removeGbyCtx.getParseContext().getOpParseCtx().get(scanOperator); + RowResolver rr = new RowResolver(); + removeGbyCtx.getParseContext().getOpParseCtx().remove(scanOperator); + + + //Construct the new RowResolver for the new TableScanOperator + try { + StructObjectInspector rowObjectInspector = (StructObjectInspector) ts.tableHandle.getDeserializer().getObjectInspector(); + List fields = rowObjectInspector + .getAllStructFieldRefs(); + for (int i = 0; i < fields.size(); i++) { + rr.put(tableName, fields.get(i).getFieldName(), new ColumnInfo(fields + .get(i).getFieldName(), TypeInfoUtils + .getTypeInfoFromObjectInspector(fields.get(i) + .getFieldObjectInspector()), tableName, false)); + } + } catch (SerDeException e) { + throw new RuntimeException(e); + } + //Set row resolver for new table + operatorContext.setRowResolver(rr); + + //Put the new TableScanOperator in the OpParseContext and topOps maps of the original ParseContext + removeGbyCtx.getParseContext().getOpParseCtx().put(scanOperator, operatorContext); + removeGbyCtx.getParseContext().getTopOps().put(tableName, scanOperator); + return null; + } + } + + public static RepaceTableScanOpProc getReplaceTableScanProc(){ + return new RepaceTableScanOpProc(); + } + + /** + * This processor removes the GroupBy operators and the interim ReduceSinkOperator from the OpParseContext + * + */ + private static class RemoveGBYProc implements NodeProcessor { + public Object process(Node nd, Stack stack, NodeProcessorCtx ctx, + Object... nodeOutputs) throws SemanticException { + GroupByOperator operator = (GroupByOperator)nd; + removeGbyCtx = (RewriteRemoveGroupbyCtx)ctx; + //On walking the operator tree using the rule 'GBY-RS-GBY', we get the GroupByOperator that is not in the 'groupOpToInputTables' + //map in the ParseContext. Hence the check. 
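+ //In other words, only the reduce-side GroupByOperator of the GBY-RS-GBY chain is absent from the groupOpToInputTables + //map, so the block below fires once per group-by construct. It records the children of this GroupByOperator (the other + //processors in this factory use that list to splice the children onto the operators that survive the rewrite), and then + //drops the interim ReduceSinkOperator and the map-side GroupByOperator from the OpParseContext and from groupOpToInputTables.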
+ if(!removeGbyCtx.getParseContext().getGroupOpToInputTables().containsKey(operator)){ + removeGbyCtx.getNewChildrenList().addAll(operator.getChildOperators()); + + ReduceSinkOperator rsOp = (ReduceSinkOperator) operator.getParentOperators().get(0); + removeGbyCtx.getOpc().remove(rsOp); + + GroupByOperator gbyOp = (GroupByOperator) rsOp.getParentOperators().get(0); + //we need to remove this GBY operator from the groupOpToInputTables map from ParseContext as well + removeGbyCtx.getParseContext().getGroupOpToInputTables().remove(gbyOp); + removeGbyCtx.getOpc().remove(gbyOp); + + } + + return null; + } + } + + public static RemoveGBYProc getRemoveGroupByProc(){ + return new RemoveGBYProc(); + } + + +} diff --git ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java index 04f560f..ed4c158 100644 --- ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java @@ -600,6 +600,22 @@ public abstract class BaseSemanticAnalyzer { public static enum SpecType {TABLE_ONLY, STATIC_PARTITION, DYNAMIC_PARTITION}; public SpecType specType; + public tableSpec(Hive db, HiveConf conf, String tableName) throws SemanticException { + this.tableName = tableName; + + try { + this.tableHandle = db.getTable(tableName); + } catch (HiveException e) { + //XTODO: Throw semantic exception here + throw new SemanticException(ErrorMsg.GENERIC_ERROR.getMsg(tableName), e); + } + this.specType = SpecType.TABLE_ONLY; + + } + private Table getTable(String tableName2) { + // TODO Auto-generated method stub + return null; + } public tableSpec(Hive db, HiveConf conf, ASTNode ast) throws SemanticException { diff --git ql/src/test/queries/clientpositive/ql_rewrite_gbtoidx.q ql/src/test/queries/clientpositive/ql_rewrite_gbtoidx.q new file mode 100644 index 0000000..d3792a2 --- /dev/null +++ ql/src/test/queries/clientpositive/ql_rewrite_gbtoidx.q @@ -0,0 +1,162 @@ + +DROP TABLE lineitem; +CREATE TABLE lineitem (L_ORDERKEY INT, + L_PARTKEY INT, + L_SUPPKEY INT, + L_LINENUMBER INT, + L_QUANTITY DOUBLE, + L_EXTENDEDPRICE DOUBLE, + L_DISCOUNT DOUBLE, + L_TAX DOUBLE, + L_RETURNFLAG STRING, + L_LINESTATUS STRING, + l_shipdate STRING, + L_COMMITDATE STRING, + L_RECEIPTDATE STRING, + L_SHIPINSTRUCT STRING, + L_SHIPMODE STRING, + L_COMMENT STRING) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|'; + +CREATE INDEX lineitem_lshipdate_idx ON TABLE lineitem(l_shipdate) AS 'org.apache.hadoop.hive.ql.index.AggregateIndexHandler' WITH DEFERRED REBUILD; +ALTER INDEX lineitem_lshipdate_idx ON lineitem REBUILD; + +set hive.optimize.index.groupby=true; + +explain select l_shipdate, + count(1) +from +lineitem +group by l_shipdate; + + +explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month; + + + +explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by 
year(l_shipdate), month(l_shipdate); + + + +explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, `_countkey` as sz +from default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate); + + +explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by year(l_shipdate), month(l_shipdate); + + + + + + + + + + + +explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, `_countkey` as sz +from default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate); + + +explain select year(L_SHIPDATE), month(L_SHIPDATE) as month_bkt, COUNT(1) + from lineitem +group by year(L_SHIPDATE), month(L_SHIPDATE); + + +explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month + and lastyear.year = thisyear.year; + + +DROP TABLE tbl; +CREATE TABLE tbl(key int, value int); +CREATE INDEX tbl_key_idx ON TABLE tbl(key) AS 'org.apache.hadoop.hive.ql.index.AggregateIndexHandler' WITH DEFERRED REBUILD; +ALTER INDEX tbl_key_idx ON tbl REBUILD; +set hive.optimize.gbyusingindex=true; +EXPLAIN select key, count(key) from tbl where key = 1 group by key; +EXPLAIN SELECT DISTINCT key FROM tbl; +EXPLAIN select count(1) from tbl; +EXPLAIN select key, count(key) from tbl group by key; +EXPLAIN select count(key) from tbl; +EXPLAIN SELECT DISTINCT key FROM tbl; +EXPLAIN SELECT key FROM tbl GROUP BY key; + +EXPLAIN SELECT DISTINCT key FROM tbl; +EXPLAIN SELECT DISTINCT key, value FROM tbl; + +EXPLAIN SELECT key FROM tbl GROUP BY key; +EXPLAIN SELECT key FROM tbl GROUP BY value, key; +EXPLAIN SELECT key, value FROM tbl GROUP BY value, key; + +EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2; +EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 AND key = 3; +EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = key; + + +EXPLAIN SELECT key FROM tbl WHERE key = 3 GROUP BY key; +EXPLAIN SELECT key, value FROM tbl WHERE value = 1 GROUP BY key, value; + +EXPLAIN SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2; + +EXPLAIN SELECT key FROM tbl WHERE value = 2 GROUP BY key; +EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl WHERE value = key; +EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl; +EXPLAIN SELECT key FROM tbl GROUP BY key, substr(key,2,3); + +DROP TABLE tbl; diff --git ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out new file mode 100644 index 0000000..45435f6 --- /dev/null +++ ql/src/test/results/clientpositive/ql_rewrite_gbtoidx.q.out @@ -0,0 +1,2863 @@ +PREHOOK: query: DROP TABLE lineitem +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE lineitem +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE lineitem (L_ORDERKEY INT, + L_PARTKEY INT, + L_SUPPKEY INT, + L_LINENUMBER INT, + L_QUANTITY DOUBLE, + 
L_EXTENDEDPRICE DOUBLE, + L_DISCOUNT DOUBLE, + L_TAX DOUBLE, + L_RETURNFLAG STRING, + L_LINESTATUS STRING, + l_shipdate STRING, + L_COMMITDATE STRING, + L_RECEIPTDATE STRING, + L_SHIPINSTRUCT STRING, + L_SHIPMODE STRING, + L_COMMENT STRING) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE lineitem (L_ORDERKEY INT, + L_PARTKEY INT, + L_SUPPKEY INT, + L_LINENUMBER INT, + L_QUANTITY DOUBLE, + L_EXTENDEDPRICE DOUBLE, + L_DISCOUNT DOUBLE, + L_TAX DOUBLE, + L_RETURNFLAG STRING, + L_LINESTATUS STRING, + l_shipdate STRING, + L_COMMITDATE STRING, + L_RECEIPTDATE STRING, + L_SHIPINSTRUCT STRING, + L_SHIPMODE STRING, + L_COMMENT STRING) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '|' +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@lineitem +PREHOOK: query: CREATE INDEX lineitem_lshipdate_idx ON TABLE lineitem(l_shipdate) AS 'org.apache.hadoop.hive.ql.index.AggregateIndexHandler' WITH DEFERRED REBUILD +PREHOOK: type: CREATEINDEX +POSTHOOK: query: CREATE INDEX lineitem_lshipdate_idx ON TABLE lineitem(l_shipdate) AS 'org.apache.hadoop.hive.ql.index.AggregateIndexHandler' WITH DEFERRED REBUILD +POSTHOOK: type: CREATEINDEX +PREHOOK: query: ALTER INDEX lineitem_lshipdate_idx ON lineitem REBUILD +PREHOOK: type: QUERY +PREHOOK: Input: default@lineitem +PREHOOK: Output: default@default__lineitem_lineitem_lshipdate_idx__ +POSTHOOK: query: ALTER INDEX lineitem_lshipdate_idx ON lineitem REBUILD +POSTHOOK: type: QUERY +POSTHOOK: Input: default@lineitem +POSTHOOK: Output: default@default__lineitem_lineitem_lshipdate_idx__ +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +PREHOOK: query: explain select l_shipdate, + count(1) +from +lineitem +group by l_shipdate +PREHOOK: type: QUERY +POSTHOOK: query: explain select l_shipdate, + count(1) +from +lineitem +group by l_shipdate +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME lineitem))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL 
l_shipdate)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL l_shipdate)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__lineitem_lineitem_lshipdate_idx__ + TableScan + Select Operator + expressions: + expr: l_shipdate + type: string + expr: _countkey + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month +PREHOOK: type: QUERY +POSTHOOK: query: explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME lineitem))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1997)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) lastyear) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME lineitem))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year 
(TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1998)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) thisyear) (= (. (TOK_TABLE_OR_COL lastyear) month) (. (TOK_TABLE_OR_COL thisyear) month)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL lastyear) month)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL thisyear) month)) (TOK_SELEXPR (/ (- (. (TOK_TABLE_OR_COL thisyear) monthly_shipments) (. (TOK_TABLE_OR_COL lastyear) monthly_shipments)) (. (TOK_TABLE_OR_COL lastyear) monthly_shipments)) monthly_shipments_delta)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-3 + Stage-3 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + lastyear:default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Filter Operator + predicate: + expr: (year(l_shipdate) = 1997) + type: boolean + Select Operator + expressions: + expr: l_shipdate + type: string + expr: _countkey + type: int + outputColumnNames: _col0, _col1 + Filter Operator + predicate: + expr: (year(_col0) = 1997) + type: boolean + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col1 + type: int + sort order: + + Map-reduce partition columns: + expr: _col1 + type: int + tag: 0 + value expressions: + expr: _col1 + type: int + expr: _col2 + type: bigint + $INTNAME1 + Reduce Output Operator + key expressions: + expr: _col1 + type: int + sort order: + + Map-reduce partition columns: + expr: _col1 + type: int + tag: 1 + value expressions: + expr: _col1 + type: int + expr: _col2 + type: bigint + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col1} {VALUE._col2} + 1 {VALUE._col1} {VALUE._col2} + handleSkewJoin: false + outputColumnNames: _col1, _col2, _col4, _col5 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col4 + type: int + expr: ((_col5 - _col2) / _col2) + type: double + outputColumnNames: _col0, _col1, _col2 + File 
Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + thisyear:default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Filter Operator + predicate: + expr: (year(l_shipdate) = 1998) + type: boolean + Select Operator + expressions: + expr: l_shipdate + type: string + expr: _countkey + type: int + outputColumnNames: _col0, _col1 + Filter Operator + predicate: + expr: (year(_col0) = 1998) + type: boolean + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by year(l_shipdate), month(l_shipdate) +PREHOOK: type: QUERY +POSTHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by year(l_shipdate), month(l_shipdate) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME lineitem))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) + +STAGE DEPENDENCIES: + Stage-1 is a root 
stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Select Operator + expressions: + expr: l_shipdate + type: string + expr: _countkey + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, `_countkey` as sz +from default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate) +PREHOOK: type: QUERY +POSTHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, `_countkey` as sz +from default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME default__lineitem_lineitem_lshipdate_idx__))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL l_shipdate)) (TOK_SELEXPR (TOK_TABLE_OR_COL `_countkey`) sz)))) t)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL sz)))) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month 
(TOK_TABLE_OR_COL l_shipdate))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + t:default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Select Operator + expressions: + expr: l_shipdate + type: string + expr: _countkey + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by year(l_shipdate), month(l_shipdate) +PREHOOK: type: QUERY +POSTHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments +from lineitem +group by year(l_shipdate), month(l_shipdate) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME lineitem))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: 
default__lineitem_lineitem_lshipdate_idx__ + Select Operator + expressions: + expr: l_shipdate + type: string + expr: _countkey + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, `_countkey` as sz +from default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate) +PREHOOK: type: QUERY +POSTHOOK: query: explain select year(l_shipdate) as year, + month(l_shipdate) as month, + sum(sz) +from ( +select l_shipdate, `_countkey` as sz +from default__lineitem_lineitem_lshipdate_idx__ +) t +group by year(l_shipdate), month(l_shipdate) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME default__lineitem_lineitem_lshipdate_idx__))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL l_shipdate)) (TOK_SELEXPR (TOK_TABLE_OR_COL `_countkey`) sz)))) t)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION sum (TOK_TABLE_OR_COL sz)))) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator 
Tree: + t:default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Select Operator + expressions: + expr: l_shipdate + type: string + expr: _countkey + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select year(L_SHIPDATE), month(L_SHIPDATE) as month_bkt, COUNT(1) + from lineitem +group by year(L_SHIPDATE), month(L_SHIPDATE) +PREHOOK: type: QUERY +POSTHOOK: query: explain select year(L_SHIPDATE), month(L_SHIPDATE) as month_bkt, COUNT(1) + from lineitem +group by year(L_SHIPDATE), month(L_SHIPDATE) +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME lineitem))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL L_SHIPDATE))) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL L_SHIPDATE)) month_bkt) (TOK_SELEXPR (TOK_FUNCTION COUNT 1))) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL L_SHIPDATE)) (TOK_FUNCTION month (TOK_TABLE_OR_COL L_SHIPDATE))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Select Operator + expressions: + expr: l_shipdate + type: string + expr: _countkey + type: int + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, 
_col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month + and lastyear.year = thisyear.year +PREHOOK: type: QUERY +POSTHOOK: query: explain select lastyear.month, + thisyear.month, + (thisyear.monthly_shipments - lastyear.monthly_shipments) / +lastyear.monthly_shipments as monthly_shipments_delta + from (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1997 + group by year(l_shipdate), month(l_shipdate) + ) lastyear join + (select year(l_shipdate) as year, + month(l_shipdate) as month, + count(1) as monthly_shipments + from lineitem + where year(l_shipdate) = 1998 + group by year(l_shipdate), month(l_shipdate) + ) thisyear + on lastyear.month = thisyear.month + and lastyear.year = thisyear.year +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME lineitem))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) 
(TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1997)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) lastyear) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME lineitem))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) year) (TOK_SELEXPR (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate)) month) (TOK_SELEXPR (TOK_FUNCTION count 1) monthly_shipments)) (TOK_WHERE (= (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) 1998)) (TOK_GROUPBY (TOK_FUNCTION year (TOK_TABLE_OR_COL l_shipdate)) (TOK_FUNCTION month (TOK_TABLE_OR_COL l_shipdate))))) thisyear) (and (= (. (TOK_TABLE_OR_COL lastyear) month) (. (TOK_TABLE_OR_COL thisyear) month)) (= (. (TOK_TABLE_OR_COL lastyear) year) (. (TOK_TABLE_OR_COL thisyear) year))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL lastyear) month)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL thisyear) month)) (TOK_SELEXPR (/ (- (. (TOK_TABLE_OR_COL thisyear) monthly_shipments) (. (TOK_TABLE_OR_COL lastyear) monthly_shipments)) (. (TOK_TABLE_OR_COL lastyear) monthly_shipments)) monthly_shipments_delta)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-3 + Stage-3 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + lastyear:default__lineitem_lineitem_lshipdate_idx__ + TableScan + alias: default__lineitem_lineitem_lshipdate_idx__ + Filter Operator + predicate: + expr: (year(l_shipdate) = 1997) + type: boolean + Select Operator + expressions: + expr: l_shipdate + type: string + expr: _countkey + type: int + outputColumnNames: _col0, _col1 + Filter Operator + predicate: + expr: (year(_col0) = 1997) + type: boolean + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: string + outputColumnNames: _col0, _col1 + Group By Operator + aggregations: + expr: sum(_col1) + bucketGroup: false + keys: + expr: year(_col0) + type: int + expr: month(_col0) + type: int + mode: hash + outputColumnNames: _col0, _col1, _col2 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + value expressions: + expr: _col2 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: sum(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + expr: _col2 + type: bigint + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col1 + type: int + expr: _col0 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col1 + type: int + expr: _col0 + type: int + tag: 0 + value expressions: + expr: _col1 + type: int + expr: _col2 + type: 
bigint
+        $INTNAME1
+          Reduce Output Operator
+            key expressions:
+                  expr: _col1
+                  type: int
+                  expr: _col0
+                  type: int
+            sort order: ++
+            Map-reduce partition columns:
+                  expr: _col1
+                  type: int
+                  expr: _col0
+                  type: int
+            tag: 1
+            value expressions:
+                  expr: _col1
+                  type: int
+                  expr: _col2
+                  type: bigint
+      Reduce Operator Tree:
+        Join Operator
+          condition map:
+               Inner Join 0 to 1
+          condition expressions:
+            0 {VALUE._col1} {VALUE._col2}
+            1 {VALUE._col1} {VALUE._col2}
+          handleSkewJoin: false
+          outputColumnNames: _col1, _col2, _col4, _col5
+          Select Operator
+            expressions:
+                  expr: _col1
+                  type: int
+                  expr: _col4
+                  type: int
+                  expr: ((_col5 - _col2) / _col2)
+                  type: double
+            outputColumnNames: _col0, _col1, _col2
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-3
+    Map Reduce
+      Alias -> Map Operator Tree:
+        thisyear:default__lineitem_lineitem_lshipdate_idx__
+          TableScan
+            alias: default__lineitem_lineitem_lshipdate_idx__
+            Filter Operator
+              predicate:
+                  expr: (year(l_shipdate) = 1998)
+                  type: boolean
+              Select Operator
+                expressions:
+                      expr: l_shipdate
+                      type: string
+                      expr: _countkey
+                      type: int
+                outputColumnNames: _col0, _col1
+                Filter Operator
+                  predicate:
+                      expr: (year(_col0) = 1998)
+                      type: boolean
+                  Select Operator
+                    expressions:
+                          expr: _col1
+                          type: int
+                          expr: _col0
+                          type: string
+                    outputColumnNames: _col0, _col1
+                    Group By Operator
+                      aggregations:
+                            expr: sum(_col1)
+                      bucketGroup: false
+                      keys:
+                            expr: year(_col0)
+                            type: int
+                            expr: month(_col0)
+                            type: int
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2
+                      Reduce Output Operator
+                        key expressions:
+                              expr: _col0
+                              type: int
+                              expr: _col1
+                              type: int
+                        sort order: ++
+                        Map-reduce partition columns:
+                              expr: _col0
+                              type: int
+                              expr: _col1
+                              type: int
+                        tag: -1
+                        value expressions:
+                              expr: _col2
+                              type: bigint
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: sum(VALUE._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: int
+                expr: KEY._col1
+                type: int
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: int
+                  expr: _col1
+                  type: int
+                  expr: _col2
+                  type: bigint
+            outputColumnNames: _col0, _col1, _col2
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: DROP TABLE tbl
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE tbl
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+PREHOOK: query: CREATE TABLE tbl(key int, value int)
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE tbl(key int, value int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@tbl
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+PREHOOK: query: CREATE INDEX tbl_key_idx ON TABLE tbl(key) AS 'org.apache.hadoop.hive.ql.index.AggregateIndexHandler' WITH DEFERRED REBUILD
+PREHOOK: type: CREATEINDEX
+POSTHOOK: query: CREATE INDEX tbl_key_idx ON TABLE tbl(key) AS 'org.apache.hadoop.hive.ql.index.AggregateIndexHandler' WITH DEFERRED REBUILD
+POSTHOOK: type: CREATEINDEX
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+PREHOOK: query: ALTER INDEX tbl_key_idx ON tbl REBUILD
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl
+PREHOOK: Output: default@default__tbl_tbl_key_idx__
+POSTHOOK: query: ALTER INDEX tbl_key_idx ON tbl REBUILD
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl
+POSTHOOK: Output: default@default__tbl_tbl_key_idx__
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+PREHOOK: query: EXPLAIN select key, count(key) from tbl where key = 1 group by key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select key, count(key) from tbl where key = 1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key)))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        default__tbl_tbl_key_idx__
+          TableScan
+            Filter Operator
+              predicate:
+                  expr: (key = 1)
+                  type: boolean
+              Filter Operator
+                predicate:
+                    expr: (key = 1)
+                    type: boolean
+                Select Operator
+                  expressions:
+                        expr: key
+                        type: int
+                        expr: _countkey
+                        type: int
+                  outputColumnNames: _col0, _col1
+                  File Output Operator
+                    compressed: false
+                    GlobalTableId: 0
+                    table:
+                        input format: org.apache.hadoop.mapred.TextInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION
[(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN select count(1) from tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(1) from tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + Group By Operator + aggregations: + expr: 
count(1) + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN select key, count(key) from tbl group by key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select key, count(key) from tbl group by key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key)))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + expr: _countkey + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN select count(key) from tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(key) from tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: 
default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + expressions: + expr: key + type: int + outputColumnNames: key + Group By Operator + aggregations: + expr: count(key) + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key FROM tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE 
[(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE 
[(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File 
Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY value, key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY value, key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + expressions: + expr: value + type: int + expr: key + type: int + outputColumnNames: value, key + Group By Operator + bucketGroup: false + keys: + expr: value + type: int + expr: key + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key, value FROM tbl GROUP BY value, key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key, value FROM tbl GROUP BY value, key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE 
[(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_GROUPBY (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Select Operator + expressions: + expr: value + type: int + expr: key + type: int + outputColumnNames: value, key + Group By Operator + bucketGroup: false + keys: + expr: value + type: int + expr: key + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col1 + type: int + expr: _col0 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION 
[(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) 2)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: (value = 2) + type: boolean + Filter Operator + predicate: + expr: (value = 2) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 AND key = 3 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = 2 AND key = 3 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE 
[(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (AND (= (TOK_TABLE_OR_COL value) 2) (= (TOK_TABLE_OR_COL key) 3))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: ((value = 2) and (key = 3)) + type: boolean + Filter Operator + predicate: + expr: ((value = 2) and (key = 3)) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT DISTINCT key, value FROM tbl WHERE value = key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, 
comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: (value = key) + type: boolean + Filter Operator + predicate: + expr: (value = key) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key FROM tbl WHERE key = 3 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key FROM tbl WHERE key = 3 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM 
(TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_WHERE (= (TOK_TABLE_OR_COL key) 3)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + default__tbl_tbl_key_idx__ + TableScan + Filter Operator + predicate: + expr: (key = 3) + type: boolean + Filter Operator + predicate: + expr: (key = 3) + type: boolean + Select Operator + expressions: + expr: key + type: int + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key, value FROM tbl WHERE value = 1 GROUP BY key, value +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key, value FROM tbl WHERE value = 1 GROUP BY key, value +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) 1)) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL value)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: (value = 1) + type: boolean + Filter Operator + predicate: + expr: (value = 1) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + 
type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2 +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT * FROM (SELECT DISTINCT key, value FROM tbl) v1 WHERE v1.value = 2 +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL value))))) v1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)) (TOK_WHERE (= (. 
(TOK_TABLE_OR_COL v1) value) 2)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + v1:tbl + TableScan + alias: tbl + Filter Operator + predicate: + expr: (value = 2) + type: boolean + Select Operator + expressions: + expr: key + type: int + expr: value + type: int + outputColumnNames: key, value + Group By Operator + bucketGroup: false + keys: + expr: key + type: int + expr: value + type: int + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: int + expr: _col1 + type: int + sort order: ++ + Map-reduce partition columns: + expr: _col0 + type: int + expr: _col1 + type: int + tag: -1 + Reduce Operator Tree: + Group By Operator + bucketGroup: false + keys: + expr: KEY._col0 + type: int + expr: KEY._col1 + type: int + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + Filter Operator + predicate: + expr: (_col1 = 2) + type: boolean + Select Operator + expressions: + expr: _col0 + type: int + expr: _col1 + type: int + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: EXPLAIN SELECT key FROM tbl WHERE value = 2 GROUP BY key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN SELECT key FROM tbl WHERE value = 2 GROUP BY key +POSTHOOK: type: QUERY +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ] +POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) 2)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + tbl + TableScan + alias: tbl + Filter Operator + 
predicate:
+                  expr: (value = 2)
+                  type: boolean
+              Filter Operator
+                predicate:
+                    expr: (value = 2)
+                    type: boolean
+                Select Operator
+                  expressions:
+                        expr: key
+                        type: int
+                  outputColumnNames: key
+                  Group By Operator
+                    bucketGroup: false
+                    keys:
+                          expr: key
+                          type: int
+                    mode: hash
+                    outputColumnNames: _col0
+                    Reduce Output Operator
+                      key expressions:
+                            expr: _col0
+                            type: int
+                      sort order: +
+                      Map-reduce partition columns:
+                            expr: _col0
+                            type: int
+                      tag: -1
+      Reduce Operator Tree:
+        Group By Operator
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: int
+          mode: mergepartial
+          outputColumnNames: _col0
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: int
+            outputColumnNames: _col0
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl WHERE value = key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl WHERE value = key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 2 3))) (TOK_WHERE (= (TOK_TABLE_OR_COL value) (TOK_TABLE_OR_COL key)))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        tbl
+          TableScan
+            alias: tbl
+            Filter Operator
+              predicate:
+                  expr: (value = key)
+                  type: boolean
+              Filter Operator
+                predicate:
+                    expr: (value = key)
+                    type: boolean
+                Select Operator
+                  expressions:
+                        expr: key
+                        type: int
+                        expr: value
+                        type: int
+                  outputColumnNames: key, value
+                  Group By Operator
+                    bucketGroup: false
+                    keys:
+                          expr: key
+                          type: int
+                          expr: substr(value, 2, 3)
+                          type: string
+                    mode: hash
+                    outputColumnNames: _col0, _col1
+                    Reduce Output Operator
+                      key expressions:
+                            expr: _col0
+                            type: int
+                            expr: _col1
+                            type: string
+                      sort order: ++
+                      Map-reduce partition columns:
+                            expr: _col0
+                            type: int
+                            expr: _col1
+                            type: string
+                      tag: -1
+      Reduce Operator Tree:
+        Group By Operator
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: int
+                expr: KEY._col1
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: int
+                  expr: _col1
+                  type: string
+            outputColumnNames: _col0, _col1
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT DISTINCT key, substr(value,2,3) FROM tbl
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECTDI (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION substr (TOK_TABLE_OR_COL value) 2 3)))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        tbl
+          TableScan
+            alias: tbl
+            Select Operator
+              expressions:
+                    expr: key
+                    type: int
+                    expr: value
+                    type: int
+              outputColumnNames: key, value
+              Group By Operator
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: int
+                      expr: substr(value, 2, 3)
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: int
+                        expr: _col1
+                        type: string
+                  sort order: ++
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: int
+                        expr: _col1
+                        type: string
+                  tag: -1
+      Reduce Operator Tree:
+        Group By Operator
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: int
+                expr: KEY._col1
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: int
+                  expr: _col1
+                  type: string
+            outputColumnNames: _col0, _col1
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key, substr(key,2,3)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT key FROM tbl GROUP BY key, substr(key,2,3)
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME tbl))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key))) (TOK_GROUPBY (TOK_TABLE_OR_COL key) (TOK_FUNCTION substr (TOK_TABLE_OR_COL key) 2 3))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        tbl
+          TableScan
+            alias: tbl
+            Select Operator
+              expressions:
+                    expr: key
+                    type: int
+              outputColumnNames: key
+              Group By Operator
+                bucketGroup: false
+                keys:
+                      expr: key
+                      type: int
+                      expr: substr(key, 2, 3)
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: int
+                        expr: _col1
+                        type: string
+                  sort order: ++
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: int
+                        expr: _col1
+                        type: string
+                  tag: -1
+      Reduce Operator Tree:
+        Group By Operator
+          bucketGroup: false
+          keys:
+                expr: KEY._col0
+                type: int
+                expr: KEY._col1
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: int
+            outputColumnNames: _col0
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: DROP TABLE tbl
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@tbl
+PREHOOK: Output: default@tbl
+POSTHOOK: query: DROP TABLE tbl
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@tbl
+POSTHOOK: Output: default@tbl
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._bucketname SIMPLE [(lineitem)lineitem.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countall EXPRESSION [(lineitem)lineitem.null, ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._countkey EXPRESSION [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__._offsets EXPRESSION [(lineitem)lineitem.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__lineitem_lineitem_lshipdate_idx__.l_shipdate SIMPLE [(lineitem)lineitem.FieldSchema(name:l_shipdate, type:string, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._bucketname SIMPLE [(tbl)tbl.FieldSchema(name:INPUT__FILE__NAME, type:string, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countall EXPRESSION [(tbl)tbl.null, ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._countkey EXPRESSION [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__._offsets EXPRESSION [(tbl)tbl.FieldSchema(name:BLOCK__OFFSET__INSIDE__FILE, type:bigint, comment:), ]
+POSTHOOK: Lineage: default__tbl_tbl_key_idx__.key SIMPLE [(tbl)tbl.FieldSchema(name:key, type:int, comment:null), ]