diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 5d2e6b0..ff4e569 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1186,13 +1186,6 @@ "Average row size is computed from average column size of all columns in the row. In the absence\n" + "of column statistics and for variable length complex columns like map, the average number of\n" + "entries/values can be specified using this config."), - // to accurately compute statistics for GROUPBY map side parallelism needs to be known - HIVE_STATS_MAP_SIDE_PARALLELISM("hive.stats.map.parallelism", 1, - "Hive/Tez optimizer estimates the data size flowing through each of the operators.\n" + - "For GROUPBY operator, to accurately compute the data size map-side parallelism needs to\n" + - "be known. By default, this value is set to 1 since optimizer is not aware of the number of\n" + - "mappers during compile-time. This Hive config can be used to specify the number of mappers\n" + - "to be used for data size computation of GROUPBY operator."), // statistics annotation fetches stats for each partition, which can be expensive. 
turning // this off will result in basic sizes being fetched from namenode instead HIVE_STATS_FETCH_PARTITION_STATS("hive.stats.fetch.partition.stats", true, diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java index 792d87f..50ffa56 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java @@ -18,22 +18,7 @@ package org.apache.hadoop.hive.ql.exec; -import java.io.Serializable; -import java.lang.management.ManagementFactory; -import java.lang.management.MemoryMXBean; -import java.lang.reflect.Field; -import java.sql.Timestamp; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - import javolution.util.FastBitSet; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -69,6 +54,20 @@ import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; +import java.io.Serializable; +import java.lang.management.ManagementFactory; +import java.lang.management.MemoryMXBean; +import java.lang.reflect.Field; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + /** * GroupBy operator implementation. 
*/ @@ -442,10 +441,10 @@ private void computeMaxEntriesHashAggr(Configuration hconf) throws HiveException estimateRowSize(); } - private static final int javaObjectOverHead = 64; - private static final int javaHashEntryOverHead = 64; - private static final int javaSizePrimitiveType = 16; - private static final int javaSizeUnknownType = 256; + public static final int javaObjectOverHead = 64; + public static final int javaHashEntryOverHead = 64; + public static final int javaSizePrimitiveType = 16; + public static final int javaSizeUnknownType = 256; /** * The size of the element at position 'pos' is returned, if possible. If the diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java index 4ff568d1..1db0e76 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java @@ -393,7 +393,7 @@ private EdgeProperty createEdgeProperty(TezEdgeProperty edgeProp, Configuration * from yarn. Falls back to Map-reduce's map size if tez * container size isn't set. */ - private Resource getContainerResource(Configuration conf) { + public static Resource getContainerResource(Configuration conf) { int memory = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVETEZCONTAINERSIZE) > 0 ? 
HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVETEZCONTAINERSIZE) : conf.getInt(MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB); diff --git ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 13d1f88..fc88cef 100644 --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -31,10 +31,12 @@ import org.apache.hadoop.hive.ql.exec.GroupByOperator; import org.apache.hadoop.hive.ql.exec.LimitOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorUtils; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.exec.RowSchema; import org.apache.hadoop.hive.ql.exec.SelectOperator; import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.tez.DagUtils; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; @@ -48,10 +50,12 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.plan.JoinDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.stats.StatsUtils; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; @@ -66,7 +70,9 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull; import 
org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import java.lang.reflect.Field; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -579,10 +585,16 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, GroupByOperator gop = (GroupByOperator) nd; Operator parent = gop.getParentOperators().get(0); Statistics parentStats = parent.getStatistics(); + // parent stats are not populated yet + if (parentStats == null) { + return null; + } + AnnotateStatsProcCtx aspCtx = (AnnotateStatsProcCtx) procCtx; HiveConf conf = aspCtx.getConf(); - int mapSideParallelism = - HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAP_SIDE_PARALLELISM); + long maxSplitSize = HiveConf.getLongVar(conf, + HiveConf.ConfVars.MAPREDMAXSPLITSIZE); + int mapSideParallelism = 1; List aggDesc = gop.getConf().getAggregators(); Map colExprMap = gop.getColumnExprMap(); RowSchema rs = gop.getSchema(); @@ -590,13 +602,33 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, boolean mapSide = false; int multiplier = mapSideParallelism; long newNumRows; - long newDataSize; + List colStats = + StatsUtils.getColStatisticsFromExprMap(conf, parentStats, colExprMap, rs); + boolean mapSideAgg = false; // map side if (gop.getChildOperators().get(0) instanceof ReduceSinkOperator || gop.getChildOperators().get(0) instanceof AppMasterEventOperator) { - mapSide = true; + mapSide = true; + + // consider approximate map side parallelism to be table data size + // divided by max split size + TableScanOperator top = OperatorUtils.findSingleOperatorUpstream(gop, + TableScanOperator.class); + final long inputSize; + // if top is null then there are multiple parents (RS as well), hence + // lets use parent statistics to get data size. 
Also maxSplitSize should + // be updated to bytes per reducer (1GB default) + if (top == null) { + inputSize = parentStats.getDataSize(); + maxSplitSize = HiveConf.getLongVar(conf, + HiveConf.ConfVars.BYTESPERREDUCER); + } else { + inputSize = top.getConf().getStatistics().getDataSize(); + } + mapSideParallelism = (int) Math.ceil((double) inputSize / maxSplitSize); + multiplier = mapSideParallelism; // map-side grouping set present. if grouping set is present then // multiply the number of rows by number of elements in grouping set @@ -607,10 +639,8 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, try { if (satisfyPrecondition(parentStats)) { + mapSideAgg = checkMapSideAggregation(gop, colStats, conf); stats = parentStats.clone(); - - List colStats = - StatsUtils.getColStatisticsFromExprMap(conf, parentStats, colExprMap, rs); stats.setColumnStats(colStats); long dvProd = 1; @@ -639,58 +669,29 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx, } } - // map side - if (mapSide) { - - // since we do not know if hash-aggregation will be enabled or disabled - // at runtime we will assume that map-side group by does not do any - // reduction.hence no group by rule will be applied - - // map-side grouping set present. 
if grouping set is present then
-            // multiply the number of rows by number of elements in grouping set
-            if (gop.getConf().isGroupingSetsPresent()) {
-              newNumRows = setMaxIfInvalid(multiplier * stats.getNumRows());
-              newDataSize = setMaxIfInvalid(multiplier * stats.getDataSize());
-              stats.setNumRows(newNumRows);
-              stats.setDataSize(newDataSize);
-              for (ColStatistics cs : colStats) {
-                if (cs != null) {
-                  long oldNumNulls = cs.getNumNulls();
-                  long newNumNulls = multiplier * oldNumNulls;
-                  cs.setNumNulls(newNumNulls);
-                }
-              }
-            } else {
-
-              // map side no grouping set
-              newNumRows = stats.getNumRows() * multiplier;
-              updateStats(stats, newNumRows, true, gop);
-            }
-          } else {
-
-            // reduce side
+          // apply GBY rule when map aggregation is enabled or if it is reduce
+          // side group by
+          newNumRows = setMaxIfInvalid(multiplier * stats.getNumRows());
+          if (mapSideAgg || !mapSide) {
             newNumRows = applyGBYRule(stats.getNumRows(), dvProd);
-            updateStats(stats, newNumRows, true, gop);
           }
+
+          // update stats, but don't update NDV as it will not change
+          updateStats(stats, newNumRows, true, gop, false);
         } else {
           if (parentStats != null) {

             stats = parentStats.clone();

-            // worst case, in the absence of column statistics assume half the rows are emitted
+            // worst case, in the absence of column statistics assume hash
+            // aggregation is disabled and reduce side only reduces half the
+            // number of rows
             if (mapSide) {
-
-              // map side
               newNumRows = multiplier * stats.getNumRows();
-              newDataSize = multiplier * stats.getDataSize();
-              stats.setNumRows(newNumRows);
-              stats.setDataSize(newDataSize);
             } else {
-
-              // reduce side
               newNumRows = parentStats.getNumRows() / 2;
-              updateStats(stats, newNumRows, false, gop);
             }
+            updateStats(stats, newNumRows, false, gop);
           }
         }

@@ -747,6 +748,107 @@ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
       return null;
     }

+    /**
+     * Roughly estimates whether map-side hash aggregation can stay enabled by
+     * checking if the hash table (number of distinct key combinations times
+     * the size of each entry) fits in the memory reserved for it. Configs that
+     * disable hash aggregation at runtime, like HIVEMAPAGGRHASHMINREDUCTION,
+     * are not taken into account.
+     * @param gop - group by operator
+     * @param colStats - column stats for key columns, entries may be null
+     * @param conf - hive conf
+     * @return true if gop is in HASH mode and the estimated hash table size is
+     *         below the memory available for hash aggregation, false otherwise
+     */
+    private boolean checkMapSideAggregation(GroupByOperator gop,
+        List<ColStatistics> colStats, HiveConf conf) {
+      // worst-case, hash aggregation disabled
+      if (!GroupByDesc.Mode.HASH.equals(gop.getConf().getMode())) {
+        return false;
+      }
+
+      float hashAggMem = conf.getFloatVar(
+          HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
+      float hashAggMaxThreshold = conf.getFloatVar(
+          HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
+
+      // get memory for container. May be use mapreduce.map.java.opts instead?
+      // getMemory() returns MB as int, so scale with long arithmetic to avoid
+      // int overflow for containers larger than 2147MB
+      long totalMemory =
+          DagUtils.getContainerResource(conf).getMemory() * 1000L * 1000L;
+      // multiply as double, Math.round(float) returns int and would cap the
+      // budget at Integer.MAX_VALUE bytes
+      long maxMemHashAgg = Math
+          .round((double) totalMemory * hashAggMem * hashAggMaxThreshold);
+
+      // estimated number of rows will be product of NDVs
+      long numEstimatedRows = 1;
+
+      // estimate size of key from column statistics
+      long avgKeySize = 0;
+      for (ColStatistics cs : colStats) {
+        if (cs != null) {
+          numEstimatedRows = saturatingMultiply(numEstimatedRows, cs.getCountDistint());
+          avgKeySize += Math.ceil(cs.getAvgColLen());
+        }
+      }
+
+      // average value size will be sum of all sizes of aggregation buffers
+      long avgValSize = 0;
+      List<AggregationDesc> aggDesc = gop.getConf().getAggregators();
+      for (AggregationDesc aggregation : aggDesc) {
+        // each evaluator has constant java object overhead
+        avgValSize += GroupByOperator.javaObjectOverHead;
+        GenericUDAFEvaluator.AggregationBuffer agg = null;
+        try {
+          agg = aggregation.getGenericUDAFEvaluator().getNewAggregationBuffer();
+        } catch (HiveException e) {
+          // in case of exception assume unknown type (256 bytes)
+          avgValSize += GroupByOperator.javaSizeUnknownType;
+        }
+
+        // aggregate size from aggregation buffers
+        if (agg == null) {
+          continue;
+        }
+        if (GenericUDAFEvaluator.isEstimable(agg)) {
+          avgValSize += ((GenericUDAFEvaluator.AbstractAggregationBuffer) agg)
+              .estimate();
+        } else {
+          // if the aggregation buffer is not estimable then get all the
+          // declared fields and compute the sizes from field types
+          Field[] fArr = ObjectInspectorUtils
+              .getDeclaredNonStaticFields(agg.getClass());
+          for (Field f : fArr) {
+            long avgSize = StatsUtils
+                .getAvgColLenOfFixedLengthTypes(f.getType().getName());
+            avgValSize += avgSize == 0 ? GroupByOperator.javaSizeUnknownType : avgSize;
+          }
+        }
+      }
+
+      // total size of each hash entry
+      long hashEntrySize =
+          GroupByOperator.javaHashEntryOverHead + avgKeySize + avgValSize;
+
+      // estimated hash table size, saturated instead of wrapping negative
+      long estHashTableSize = saturatingMultiply(numEstimatedRows, hashEntrySize);
+
+      return estHashTableSize < maxMemHashAgg;
+    }
+
+    /**
+     * Multiplies two row/size estimates without overflowing. Non-positive
+     * operands give 0, products above Long.MAX_VALUE give Long.MAX_VALUE.
+     */
+    private long saturatingMultiply(long a, long b) {
+      if (a <= 0 || b <= 0) {
+        return 0;
+      }
+      return a > Long.MAX_VALUE / b ? Long.MAX_VALUE : a * b;
+    }
+
     private long applyGBYRule(long numRows, long dvProd) {
       long newNumRows = numRows;

@@ -1378,6 +1480,7 @@ public static NodeProcessor getDefaultRule() {
     return new DefaultStatsRule();
   }

+
   /**
    * Update the basic statistics of the statistics object based on the row number
    * @param stats
@@ -1389,6 +1492,12 @@ public static NodeProcessor getDefaultRule() {
    */
   static void updateStats(Statistics stats, long newNumRows,
       boolean useColStats, Operator op) {
+    updateStats(stats, newNumRows, useColStats, op, true);
+  }
+
+  static void updateStats(Statistics stats, long newNumRows,
+      boolean useColStats, Operator op,
+      boolean updateNDV) {

     if (newNumRows <= 0) {
       LOG.info("STATS-" + op.toString() + ": Overflow in number of rows."
@@ -1406,17 +1515,19 @@ static void updateStats(Statistics stats, long newNumRows, long oldNumNulls = cs.getNumNulls(); long oldDV = cs.getCountDistint(); long newNumNulls = Math.round(ratio * oldNumNulls); - long newDV = oldDV; - - // if ratio is greater than 1, then number of rows increases. This can happen - // when some operators like GROUPBY duplicates the input rows in which case - // number of distincts should not change. Update the distinct count only when - // the output number of rows is less than input number of rows. - if (ratio <= 1.0) { - newDV = (long) Math.ceil(ratio * oldDV); - } cs.setNumNulls(newNumNulls); - cs.setCountDistint(newDV); + if (updateNDV) { + long newDV = oldDV; + + // if ratio is greater than 1, then number of rows increases. This can happen + // when some operators like GROUPBY duplicates the input rows in which case + // number of distincts should not change. Update the distinct count only when + // the output number of rows is less than input number of rows. 
+ if (ratio <= 1.0) { + newDV = (long) Math.ceil(ratio * oldDV); + } + cs.setCountDistint(newDV); + } } stats.setColumnStats(colStats); long newDataSize = StatsUtils.getDataSizeFromColumnStats(newNumRows, colStats); diff --git ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index b51f7a8..1e610b7 100644 --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -751,7 +751,8 @@ public static long getAvgColLenOfFixedLengthTypes(String colType) { || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) { return JavaDataModel.get().primitive1(); } else if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME) - || colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) { + || colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME) + || colType.equalsIgnoreCase("long")) { return JavaDataModel.get().primitive2(); } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) { return JavaDataModel.get().lengthOfTimestamp(); @@ -780,7 +781,8 @@ public static long getSizeOfPrimitiveTypeArraysFromType(String colType, int leng return JavaDataModel.get().lengthForIntArrayOfSize(length); } else if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) { return JavaDataModel.get().lengthForDoubleArrayOfSize(length); - } else if (colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) { + } else if (colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME) + || colType.equalsIgnoreCase("long")) { return JavaDataModel.get().lengthForLongArrayOfSize(length); } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) { return JavaDataModel.get().lengthForByteArrayOfSize(length); diff --git ql/src/test/queries/clientpositive/annotate_stats_groupby.q ql/src/test/queries/clientpositive/annotate_stats_groupby.q index 1c0829d..d975d3b 100644 --- 
ql/src/test/queries/clientpositive/annotate_stats_groupby.q +++ ql/src/test/queries/clientpositive/annotate_stats_groupby.q @@ -1,4 +1,7 @@ set hive.stats.fetch.column.stats=true; +set hive.map.aggr.hash.percentmemory=0.0f; + +-- hash aggregation is disabled create table if not exists loc_staging ( state string, @@ -58,7 +61,8 @@ explain select state,locid from loc_orc group by state,locid grouping sets((stat -- map-side GBY numRows: 32 reduce-side GBY numRows: 16 explain select state,locid from loc_orc group by state,locid grouping sets((state,locid),(state),(locid),()); -set hive.stats.map.parallelism=10; +set mapred.max.split.size=80; +-- map-side parallelism will be 10 -- map-side GBY: numRows: 80 (map-side will not do any reduction) -- reduce-side GBY: numRows: 2 Reason: numDistinct of year is 2. numRows = min(80/2, 2) @@ -67,8 +71,8 @@ explain select year from loc_orc group by year; -- map-side GBY numRows: 320 reduce-side GBY numRows: 42 Reason: numDistinct of state and locid are 6,7 resp. numRows = min(320/2, 6*7) explain select state,locid from loc_orc group by state,locid with cube; +set mapred.max.split.size=1000; set hive.stats.fetch.column.stats=false; -set hive.stats.map.parallelism=1; -- map-side GBY numRows: 32 reduce-side GBY numRows: 16 explain select state,locid from loc_orc group by state,locid with cube; @@ -88,7 +92,7 @@ explain select state,locid from loc_orc group by state,locid grouping sets((stat -- map-side GBY numRows: 32 reduce-side GBY numRows: 16 explain select state,locid from loc_orc group by state,locid grouping sets((state,locid),(state),(locid),()); -set hive.stats.map.parallelism=10; +set mapred.max.split.size=80; -- map-side GBY: numRows: 80 (map-side will not do any reduction) -- reduce-side GBY: numRows: 2 Reason: numDistinct of year is 2. 
numRows = min(80/2, 2) diff --git ql/src/test/queries/clientpositive/annotate_stats_groupby_hashagg.q ql/src/test/queries/clientpositive/annotate_stats_groupby_hashagg.q new file mode 100644 index 0000000..b770261 --- /dev/null +++ ql/src/test/queries/clientpositive/annotate_stats_groupby_hashagg.q @@ -0,0 +1,102 @@ +set hive.stats.fetch.column.stats=true; +set hive.map.aggr.hash.percentmemory=0.5f; + +-- hash aggregation is disabled + +create table if not exists loc_staging ( + state string, + locid int, + zip bigint, + year int +) row format delimited fields terminated by '|' stored as textfile; + +create table loc_orc like loc_staging; +alter table loc_orc set fileformat orc; + +load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging; + +insert overwrite table loc_orc select * from loc_staging; + +-- numRows: 8 rawDataSize: 796 +explain select * from loc_orc; + +-- partial column stats +analyze table loc_orc compute statistics for columns state; + +-- inner group by: map - numRows: 8 reduce - numRows: 4 +-- outer group by: map - numRows: 4 reduce numRows: 2 +explain select a, c, min(b) +from ( select state as a, locid as b, count(*) as c + from loc_orc + group by state,locid + ) sq1 +group by a,c; + +analyze table loc_orc compute statistics for columns state,locid,zip,year; + +-- only one distinct value in year column + 1 NULL value +-- map-side GBY: numRows: 2 +-- reduce-side GBY: numRows: 1 +explain select year from loc_orc group by year; + +-- map-side GBY: numRows: 4 +-- reduce-side GBY: numRows: 2 +explain select state,locid from loc_orc group by state,locid; + +-- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid with cube; + +-- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid with rollup; + +-- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by 
state,locid grouping sets((state)); + +-- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid grouping sets((state),(locid)); + +-- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid grouping sets((state),(locid),()); + +-- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid grouping sets((state,locid),(state),(locid),()); + +set mapred.max.split.size=80; + +-- map-side GBY: numRows: 2 +-- reduce-side GBY: numRows: 1 +explain select year from loc_orc group by year; + +-- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid with cube; + +set mapred.max.split.size=1000; +set hive.stats.fetch.column.stats=false; + +-- map-side GBY numRows: 32 reduce-side GBY numRows: 16 +explain select state,locid from loc_orc group by state,locid with cube; + +-- map-side GBY numRows: 24 reduce-side GBY numRows: 12 +explain select state,locid from loc_orc group by state,locid with rollup; + +-- map-side GBY numRows: 8 reduce-side GBY numRows: 4 +explain select state,locid from loc_orc group by state,locid grouping sets((state)); + +-- map-side GBY numRows: 16 reduce-side GBY numRows: 8 +explain select state,locid from loc_orc group by state,locid grouping sets((state),(locid)); + +-- map-side GBY numRows: 24 reduce-side GBY numRows: 12 +explain select state,locid from loc_orc group by state,locid grouping sets((state),(locid),()); + +-- map-side GBY numRows: 32 reduce-side GBY numRows: 16 +explain select state,locid from loc_orc group by state,locid grouping sets((state,locid),(state),(locid),()); + +set mapred.max.split.size=80; + +-- map-side GBY: numRows: 80 +-- reduce-side GBY: numRows: 40 +explain select year from loc_orc group by year; + +-- map-side GBY numRows: 320 reduce-side GBY numRows: 160 +explain select state,locid from loc_orc 
group by state,locid with cube; + diff --git ql/src/test/results/clientpositive/annotate_stats_groupby.q.out ql/src/test/results/clientpositive/annotate_stats_groupby.q.out index 871c4217..eb0ef2a 100644 --- ql/src/test/results/clientpositive/annotate_stats_groupby.q.out +++ ql/src/test/results/clientpositive/annotate_stats_groupby.q.out @@ -1,4 +1,6 @@ -PREHOOK: query: create table if not exists loc_staging ( +PREHOOK: query: -- hash aggregation is disabled + +create table if not exists loc_staging ( state string, locid int, zip bigint, @@ -7,7 +9,9 @@ PREHOOK: query: create table if not exists loc_staging ( PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@loc_staging -POSTHOOK: query: create table if not exists loc_staging ( +POSTHOOK: query: -- hash aggregation is disabled + +create table if not exists loc_staging ( state string, locid int, zip bigint, @@ -339,12 +343,12 @@ STAGE PLANS: keys: state (type: string), locid (type: int), '0' (type: string) mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 32 Data size: 3184 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 32 Data size: 5600 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) - Statistics: Num rows: 32 Data size: 3184 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 32 Data size: 5600 Basic stats: COMPLETE Column stats: COMPLETE Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) @@ -394,12 +398,12 @@ STAGE PLANS: keys: state (type: string), locid (type: int), '0' (type: string) mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 24 Data size: 2388 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num 
rows: 24 Data size: 4200 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) - Statistics: Num rows: 24 Data size: 2388 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 24 Data size: 4200 Basic stats: COMPLETE Column stats: COMPLETE Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) @@ -449,12 +453,12 @@ STAGE PLANS: keys: state (type: string), locid (type: int), '0' (type: string) mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 1400 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) - Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 8 Data size: 1400 Basic stats: COMPLETE Column stats: COMPLETE Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) @@ -504,12 +508,12 @@ STAGE PLANS: keys: state (type: string), locid (type: int), '0' (type: string) mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 16 Data size: 1592 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 16 Data size: 2800 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) - Statistics: Num rows: 16 Data size: 1592 Basic stats: 
COMPLETE Column stats: COMPLETE + Statistics: Num rows: 16 Data size: 2800 Basic stats: COMPLETE Column stats: COMPLETE Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) @@ -559,12 +563,12 @@ STAGE PLANS: keys: state (type: string), locid (type: int), '0' (type: string) mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 24 Data size: 2388 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 24 Data size: 4200 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) - Statistics: Num rows: 24 Data size: 2388 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 24 Data size: 4200 Basic stats: COMPLETE Column stats: COMPLETE Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) @@ -614,12 +618,12 @@ STAGE PLANS: keys: state (type: string), locid (type: int), '0' (type: string) mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 32 Data size: 3184 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 32 Data size: 5600 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) - Statistics: Num rows: 32 Data size: 3184 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 32 Data size: 5600 Basic stats: COMPLETE Column stats: COMPLETE Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) @@ -644,11 +648,15 @@ STAGE PLANS: Processor Tree: ListSink -PREHOOK: query: -- map-side GBY: 
numRows: 80 (map-side will not do any reduction) +PREHOOK: query: -- map-side parallelism will be 10 + +-- map-side GBY: numRows: 80 (map-side will not do any reduction) -- reduce-side GBY: numRows: 2 Reason: numDistinct of year is 2. numRows = min(80/2, 2) explain select year from loc_orc group by year PREHOOK: type: QUERY -POSTHOOK: query: -- map-side GBY: numRows: 80 (map-side will not do any reduction) +POSTHOOK: query: -- map-side parallelism will be 10 + +-- map-side GBY: numRows: 80 (map-side will not do any reduction) -- reduce-side GBY: numRows: 2 Reason: numDistinct of year is 2. numRows = min(80/2, 2) explain select year from loc_orc group by year POSTHOOK: type: QUERY @@ -726,12 +734,12 @@ STAGE PLANS: keys: state (type: string), locid (type: int), '0' (type: string) mode: hash outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 320 Data size: 31840 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 320 Data size: 56000 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) sort order: +++ Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) - Statistics: Num rows: 320 Data size: 31840 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 320 Data size: 56000 Basic stats: COMPLETE Column stats: COMPLETE Reduce Operator Tree: Group By Operator keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) diff --git ql/src/test/results/clientpositive/annotate_stats_groupby_hashagg.q.out ql/src/test/results/clientpositive/annotate_stats_groupby_hashagg.q.out new file mode 100644 index 0000000..e121a1e --- /dev/null +++ ql/src/test/results/clientpositive/annotate_stats_groupby_hashagg.q.out @@ -0,0 +1,1204 @@ +PREHOOK: query: -- hash aggregation is disabled + +create table if not exists loc_staging ( + state string, + locid int, + zip bigint, + year int +) row format 
delimited fields terminated by '|' stored as textfile +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@loc_staging +POSTHOOK: query: -- hash aggregation is disabled + +create table if not exists loc_staging ( + state string, + locid int, + zip bigint, + year int +) row format delimited fields terminated by '|' stored as textfile +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@loc_staging +PREHOOK: query: create table loc_orc like loc_staging +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@loc_orc +POSTHOOK: query: create table loc_orc like loc_staging +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@loc_orc +PREHOOK: query: alter table loc_orc set fileformat orc +PREHOOK: type: ALTERTABLE_FILEFORMAT +PREHOOK: Input: default@loc_orc +PREHOOK: Output: default@loc_orc +POSTHOOK: query: alter table loc_orc set fileformat orc +POSTHOOK: type: ALTERTABLE_FILEFORMAT +POSTHOOK: Input: default@loc_orc +POSTHOOK: Output: default@loc_orc +PREHOOK: query: load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@loc_staging +POSTHOOK: query: load data local inpath '../../data/files/loc.txt' overwrite into table loc_staging +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@loc_staging +PREHOOK: query: insert overwrite table loc_orc select * from loc_staging +PREHOOK: type: QUERY +PREHOOK: Input: default@loc_staging +PREHOOK: Output: default@loc_orc +POSTHOOK: query: insert overwrite table loc_orc select * from loc_staging +POSTHOOK: type: QUERY +POSTHOOK: Input: default@loc_staging +POSTHOOK: Output: default@loc_orc +POSTHOOK: Lineage: loc_orc.locid SIMPLE [(loc_staging)loc_staging.FieldSchema(name:locid, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc.state 
SIMPLE [(loc_staging)loc_staging.FieldSchema(name:state, type:string, comment:null), ] +POSTHOOK: Lineage: loc_orc.year SIMPLE [(loc_staging)loc_staging.FieldSchema(name:year, type:int, comment:null), ] +POSTHOOK: Lineage: loc_orc.zip SIMPLE [(loc_staging)loc_staging.FieldSchema(name:zip, type:bigint, comment:null), ] +PREHOOK: query: -- numRows: 8 rawDataSize: 796 +explain select * from loc_orc +PREHOOK: type: QUERY +POSTHOOK: query: -- numRows: 8 rawDataSize: 796 +explain select * from loc_orc +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: state (type: string), locid (type: int), zip (type: bigint), year (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + ListSink + +PREHOOK: query: -- partial column stats +analyze table loc_orc compute statistics for columns state +PREHOOK: type: QUERY +PREHOOK: Input: default@loc_orc +#### A masked pattern was here #### +POSTHOOK: query: -- partial column stats +analyze table loc_orc compute statistics for columns state +POSTHOOK: type: QUERY +POSTHOOK: Input: default@loc_orc +#### A masked pattern was here #### +PREHOOK: query: -- inner group by: map - numRows: 8 reduce - numRows: 4 +-- outer group by: map - numRows: 4 reduce numRows: 2 +explain select a, c, min(b) +from ( select state as a, locid as b, count(*) as c + from loc_orc + group by state,locid + ) sq1 +group by a,c +PREHOOK: type: QUERY +POSTHOOK: query: -- inner group by: map - numRows: 8 reduce - numRows: 4 +-- outer group by: map - numRows: 4 reduce numRows: 2 +explain select a, c, min(b) +from ( select state as a, locid as b, count(*) as c + from loc_orc + group by state,locid + ) sq1 +group by a,c +POSTHOOK: type: QUERY +STAGE 
DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: PARTIAL + Group By Operator + aggregations: count() + keys: state (type: string), locid (type: int) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: PARTIAL + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int) + Statistics: Num rows: 4 Data size: 376 Basic stats: COMPLETE Column stats: PARTIAL + value expressions: _col2 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 200 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: _col0 (type: string), _col1 (type: int), _col2 (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 200 Basic stats: COMPLETE Column stats: PARTIAL + Group By Operator + aggregations: min(_col1) + keys: _col0 (type: string), _col2 (type: bigint) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: bigint) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: bigint) + Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: PARTIAL + value expressions: _col2 (type: int) + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0) + keys: KEY._col0 (type: string), KEY._col1 (type: bigint) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: PARTIAL + Select Operator + expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: PARTIAL + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 104 Basic stats: COMPLETE Column stats: PARTIAL + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: analyze table loc_orc compute statistics for columns state,locid,zip,year +PREHOOK: type: QUERY +PREHOOK: Input: default@loc_orc +#### A masked pattern was here #### +POSTHOOK: query: analyze table loc_orc compute statistics for columns state,locid,zip,year +POSTHOOK: type: QUERY +POSTHOOK: Input: default@loc_orc +#### A masked pattern was here #### +PREHOOK: query: -- only one distinct value in year column + 1 NULL value +-- map-side GBY: numRows: 2 +-- reduce-side GBY: numRows: 1 +explain select year from loc_orc group by year +PREHOOK: type: QUERY +POSTHOOK: query: -- only one distinct value in year column + 1 NULL value +-- map-side 
GBY: numRows: 2 +-- reduce-side GBY: numRows: 1 +explain select year from loc_orc group by year +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: year (type: int) + outputColumnNames: year + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: year (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY: numRows: 4 +-- reduce-side GBY: numRows: 2 +explain select state,locid from loc_orc group by state,locid +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY: numRows: 4 +-- reduce-side GBY: numRows: 2 +explain select state,locid from loc_orc group by 
state,locid +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: state (type: string), locid (type: int) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 360 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int) + Statistics: Num rows: 4 Data size: 360 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid with cube +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY 
numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid with cube +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 350 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink 
+ +PREHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid with rollup +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid with rollup +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 350 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output 
format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid grouping sets((state)) +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid grouping sets((state)) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 350 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE 
Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid grouping sets((state),(locid)) +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid grouping sets((state),(locid)) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: 
Num rows: 2 Data size: 350 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid grouping sets((state),(locid),()) +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid grouping sets((state),(locid),()) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 4 Data size: 
700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 350 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid grouping sets((state,locid),(state),(locid),()) +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid grouping sets((state,locid),(state),(locid),()) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE 
Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 350 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY: numRows: 2 +-- reduce-side GBY: numRows: 1 +explain select year from loc_orc group by year +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY: numRows: 2 +-- reduce-side GBY: numRows: 1 +explain select year from loc_orc group by year +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: year (type: int) + outputColumnNames: year + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: year (type: int) + mode: hash + 
outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid with cube +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 4 reduce-side GBY numRows: 2 +explain select state,locid from loc_orc group by state,locid with cube +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + 
outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 4 Data size: 700 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 350 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 180 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 32 reduce-side GBY numRows: 16 +explain select state,locid from loc_orc group by state,locid with cube +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 32 reduce-side GBY numRows: 16 +explain select state,locid from loc_orc group by state,locid with cube +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: 
state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 32 Data size: 3184 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 32 Data size: 3184 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 16 Data size: 1592 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 16 Data size: 1592 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 16 Data size: 1592 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 24 reduce-side GBY numRows: 12 +explain select state,locid from loc_orc group by state,locid with rollup +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 24 reduce-side GBY numRows: 12 +explain select state,locid from loc_orc group by state,locid with rollup +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + 
alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 24 Data size: 2388 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 24 Data size: 2388 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 12 Data size: 1194 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12 Data size: 1194 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12 Data size: 1194 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 8 reduce-side GBY numRows: 4 +explain select state,locid from loc_orc group by state,locid grouping sets((state)) +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 8 reduce-side GBY numRows: 4 +explain select state,locid from loc_orc group by state,locid 
grouping sets((state)) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 4 Data size: 398 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 398 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 4 Data size: 398 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 16 reduce-side GBY numRows: 8 +explain select state,locid from loc_orc 
group by state,locid grouping sets((state),(locid)) +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 16 reduce-side GBY numRows: 8 +explain select state,locid from loc_orc group by state,locid grouping sets((state),(locid)) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 16 Data size: 1592 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 16 Data size: 1592 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 24 reduce-side GBY numRows: 12 +explain select state,locid from loc_orc group by state,locid grouping sets((state),(locid),()) +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 24 reduce-side GBY numRows: 12 +explain select state,locid from loc_orc group by state,locid grouping sets((state),(locid),()) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 24 Data size: 2388 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 24 Data size: 2388 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 12 Data size: 1194 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12 Data size: 1194 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false 
+ Statistics: Num rows: 12 Data size: 1194 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 32 reduce-side GBY numRows: 16 +explain select state,locid from loc_orc group by state,locid grouping sets((state,locid),(state),(locid),()) +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 32 reduce-side GBY numRows: 16 +explain select state,locid from loc_orc group by state,locid grouping sets((state,locid),(state),(locid),()) +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 32 Data size: 3184 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 32 Data size: 3184 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 16 Data size: 1592 Basic stats: 
COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 16 Data size: 1592 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 16 Data size: 1592 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY: numRows: 80 +-- reduce-side GBY: numRows: 40 +explain select year from loc_orc group by year +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY: numRows: 80 +-- reduce-side GBY: numRows: 40 +explain select year from loc_orc group by year +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: year (type: int) + outputColumnNames: year + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: year (type: int) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 80 Data size: 7960 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 80 Data size: 7960 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 40 Data size: 3980 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int) + 
outputColumnNames: _col0 + Statistics: Num rows: 40 Data size: 3980 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 40 Data size: 3980 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: -- map-side GBY numRows: 320 reduce-side GBY numRows: 160 +explain select state,locid from loc_orc group by state,locid with cube +PREHOOK: type: QUERY +POSTHOOK: query: -- map-side GBY numRows: 320 reduce-side GBY numRows: 160 +explain select state,locid from loc_orc group by state,locid with cube +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: loc_orc + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: state (type: string), locid (type: int) + outputColumnNames: state, locid + Statistics: Num rows: 8 Data size: 796 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: state (type: string), locid (type: int), '0' (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 320 Data size: 31840 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: int), _col2 (type: string) + sort order: +++ + Map-reduce partition columns: _col0 (type: string), _col1 (type: int), _col2 (type: string) + Statistics: Num rows: 320 Data size: 31840 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string), KEY._col1 (type: int), KEY._col2 (type: string) + mode: mergepartial + 
outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 160 Data size: 15920 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 160 Data size: 15920 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 160 Data size: 15920 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +