Index: ql/src/test/results/clientpositive/groupby_sort_11.q.out
===================================================================
--- ql/src/test/results/clientpositive/groupby_sort_11.q.out (revision 0)
+++ ql/src/test/results/clientpositive/groupby_sort_11.q.out (working copy)
@@ -0,0 +1,641 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) PARTITIONED BY (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) PARTITIONED BY (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: INSERT OVERWRITE TABLE T1 PARTITION (ds='1')
+SELECT * from src where key < 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@t1@ds=1
+POSTHOOK: query: INSERT OVERWRITE TABLE T1 PARTITION (ds='1')
+SELECT * from src where key < 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@t1@ds=1
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: EXPLAIN select count(distinct key) from T1
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select count(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL key))))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(DISTINCT key)
+                bucketGroup: true
+                keys:
+                      expr: key
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                  sort order: +
+                  tag: -1
+                  value expressions:
+                        expr: _col1
+                        type: bigint
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(VALUE._col0)
+          bucketGroup: false
+          mode: mergepartial
+          outputColumnNames: _col0
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: bigint
+            outputColumnNames: _col0
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: select count(distinct key) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+6
+PREHOOK: query: EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_TABLE_OR_COL key))))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(DISTINCT key)
+                      expr: count(1)
+                      expr: count(key)
+                      expr: sum(DISTINCT key)
+                bucketGroup: true
+                keys:
+                      expr: key
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                  sort order: +
+                  tag: -1
+                  value expressions:
+                        expr: _col1
+                        type: bigint
+                        expr: _col2
+                        type: bigint
+                        expr: _col3
+                        type: bigint
+                        expr: _col4
+                        type: double
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(VALUE._col0)
+                expr: count(VALUE._col1)
+                expr: count(VALUE._col2)
+                expr: sum(VALUE._col3)
+          bucketGroup: false
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: bigint
+                  expr: _col1
+                  type: bigint
+                  expr: _col2
+                  type: bigint
+                  expr: _col3
+                  type: double
+            outputColumnNames: _col0, _col1, _col2, _col3
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: select count(distinct key), count(1), count(key), sum(distinct key) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct key), count(1), count(key), sum(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+6 10 10 28.0
+PREHOOK: query: EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_TABLE_OR_COL key)))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(DISTINCT key)
+                      expr: count(1)
+                      expr: count(key)
+                      expr: sum(DISTINCT key)
+                bucketGroup: true
+                keys:
+                      expr: key
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                  sort order: ++
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: string
+                  tag: -1
+                  value expressions:
+                        expr: _col1
+                        type: bigint
+                        expr: _col2
+                        type: bigint
+                        expr: _col3
+                        type: bigint
+                        expr: _col4
+                        type: double
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(DISTINCT KEY._col1:0._col0)
+                expr: count(VALUE._col1)
+                expr: count(VALUE._col2)
+                expr: sum(DISTINCT KEY._col1:1._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col1:1._col0
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2, _col3, _col4
+          Select Operator
+            expressions:
+                  expr: _col1
+                  type: bigint
+                  expr: _col2
+                  type: bigint
+                  expr: _col3
+                  type: bigint
+                  expr: _col4
+                  type: double
+            outputColumnNames: _col0, _col1, _col2, _col3
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+1 3 3 0.0
+1 1 1 2.0
+1 1 1 4.0
+1 3 3 5.0
+1 1 1 8.0
+1 1 1 9.0
+PREHOOK: query: EXPLAIN select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTION count 1)) (TOK_SELEXPR (TOK_FUNCTION count (TOK_TABLE_OR_COL key))) (TOK_SELEXPR (TOK_FUNCTIONDI sum (TOK_TABLE_OR_COL key)))) (TOK_GROUPBY (TOK_TABLE_OR_COL key))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(DISTINCT key)
+                      expr: count(1)
+                      expr: count(key)
+                      expr: sum(DISTINCT key)
+                bucketGroup: true
+                keys:
+                      expr: key
+                      type: string
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: string
+                  sort order: ++
+                  Map-reduce partition columns:
+                        expr: _col0
+                        type: string
+                  tag: -1
+                  value expressions:
+                        expr: _col1
+                        type: bigint
+                        expr: _col2
+                        type: bigint
+                        expr: _col3
+                        type: bigint
+                        expr: _col4
+                        type: double
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(DISTINCT KEY._col1:0._col0)
+                expr: count(VALUE._col1)
+                expr: count(VALUE._col2)
+                expr: sum(DISTINCT KEY._col1:1._col0)
+          bucketGroup: false
+          keys:
+                expr: KEY._col1:1._col0
+                type: string
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2, _col3, _col4
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: string
+                  expr: _col1
+                  type: bigint
+                  expr: _col2
+                  type: bigint
+                  expr: _col3
+                  type: bigint
+                  expr: _col4
+                  type: double
+            outputColumnNames: _col0, _col1, _col2, _col3, _col4
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+0 1 3 3 0.0
+2 1 1 1 2.0
+4 1 1 1 4.0
+5 1 3 3 5.0
+8 1 1 1 8.0
+9 1 1 1 9.0
+PREHOOK: query: EXPLAIN select count(distinct key+key) from T1
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select count(distinct key+key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count (+ (TOK_TABLE_OR_COL key) (TOK_TABLE_OR_COL key)))))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Group By Operator
+                aggregations:
+                      expr: count(DISTINCT (key + key))
+                bucketGroup: false
+                keys:
+                      expr: (key + key)
+                      type: double
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: double
+                  sort order: +
+                  tag: -1
+                  value expressions:
+                        expr: _col1
+                        type: bigint
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(DISTINCT KEY._col0:0._col0)
+          bucketGroup: false
+          mode: mergepartial
+          outputColumnNames: _col0
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: bigint
+            outputColumnNames: _col0
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: select count(distinct key+key) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct key+key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+6
+PREHOOK: query: EXPLAIN select count(distinct 1) from T1
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select count(distinct 1) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count 1)))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              Group By Operator
+                aggregations:
+                      expr: count(DISTINCT 1)
+                bucketGroup: false
+                keys:
+                      expr: 1
+                      type: int
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Reduce Output Operator
+                  key expressions:
+                        expr: _col0
+                        type: int
+                  sort order: +
+                  tag: -1
+                  value expressions:
+                        expr: _col1
+                        type: bigint
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(DISTINCT KEY._col0:0._col0)
+          bucketGroup: false
+          mode: mergepartial
+          outputColumnNames: _col0
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: bigint
+            outputColumnNames: _col0
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: select count(distinct 1) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct 1) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+1
+PREHOOK: query: EXPLAIN select count(distinct key) from T1
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select count(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTIONDI count (TOK_TABLE_OR_COL key))))))
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Alias -> Map Operator Tree:
+        t1
+          TableScan
+            alias: t1
+            Select Operator
+              expressions:
+                    expr: key
+                    type: string
+              outputColumnNames: key
+              Reduce Output Operator
+                key expressions:
+                      expr: key
+                      type: string
+                sort order: +
+                tag: -1
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations:
+                expr: count(DISTINCT KEY._col0:0._col0)
+          bucketGroup: false
+          mode: complete
+          outputColumnNames: _col0
+          Select Operator
+            expressions:
+                  expr: _col0
+                  type: bigint
+            outputColumnNames: _col0
+            File Output Operator
+              compressed: false
+              GlobalTableId: 0
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+
+PREHOOK: query: select count(distinct key) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: query: select count(distinct key) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t1@ds=1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1 PARTITION(ds=1).key SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: t1 PARTITION(ds=1).val SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+6
Index: ql/src/test/queries/clientpositive/groupby_sort_11.q
===================================================================
--- ql/src/test/queries/clientpositive/groupby_sort_11.q (revision 0)
+++ ql/src/test/queries/clientpositive/groupby_sort_11.q (working copy)
@@ -0,0 +1,40 @@
+set hive.enforce.bucketing = true;
+set hive.enforce.sorting = true;
+set hive.exec.reducers.max = 1;
+set hive.map.groupby.sorted=true;
+
+CREATE TABLE T1(key STRING, val STRING) PARTITIONED BY (ds string)
+CLUSTERED BY (key) SORTED BY (key) INTO 2 BUCKETS;
+
+-- perform an insert to make sure there are 2 files
+INSERT OVERWRITE TABLE T1 PARTITION (ds='1')
+SELECT * from src where key < 10;
+
+-- The plan is optimized to perform partial aggregation on the mapper
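+-- (T1 is bucketed and sorted by key, so each mapper reads one bucket in which
+-- equal keys are adjacent; a mapper can therefore compute a partial aggregate
+-- over the distinct keys and emit a single row, instead of one row per key)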
+EXPLAIN select count(distinct key) from T1;
+select count(distinct key) from T1;
+
+-- The plan is optimized to perform partial aggregation on the mapper
+EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1;
+select count(distinct key), count(1), count(key), sum(distinct key) from T1;
+
+-- The plan is not changed in the presence of a grouping key
+EXPLAIN select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key;
+select count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key;
+
+-- The plan is not changed in the presence of a grouping key
+EXPLAIN select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key;
+select key, count(distinct key), count(1), count(key), sum(distinct key) from T1 group by key;
+
+-- The plan is not changed in the presence of a grouping key expression
+EXPLAIN select count(distinct key+key) from T1;
+select count(distinct key+key) from T1;
+
+EXPLAIN select count(distinct 1) from T1;
+select count(distinct 1) from T1;
+
+set hive.map.aggr=false;
+
+-- no plan change if map aggr is turned off
+EXPLAIN select count(distinct key) from T1;
+select count(distinct key) from T1;
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (revision 1465924)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java (working copy)
@@ -53,6 +53,7 @@
 import org.apache.hadoop.hive.ql.parse.ParseContext;
 import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
 import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
 import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
@@ -60,6 +61,7 @@
 import org.apache.hadoop.hive.ql.plan.GroupByDesc;
 import org.apache.hadoop.hive.ql.plan.OperatorDesc;
 import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
 import org.apache.hadoop.util.StringUtils;
 
 /**
@@ -107,7 +109,7 @@
     GraphWalker ogw = new DefaultGraphWalker(disp);
 
     // Create a list of topop nodes
-    ArrayList<Node> topNodes = new ArrayList<Node>();
+    List<Node> topNodes = new ArrayList<Node>();
     topNodes.addAll(pctx.getTopOps().values());
     ogw.startWalking(topNodes, null);
 
@@ -174,16 +176,79 @@
       GroupByOptimizerSortMatch match = checkSortGroupBy(stack, groupByOp);
       boolean useMapperSort =
           HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT);
+      GroupByDesc groupByOpDesc = groupByOp.getConf();
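+      // At most one of three rewrites is applied below:
+      //   removeReduceSink  - complete sort/bucket match without distinct
+      //                       aggregates: the group by runs entirely map-side
+      //   optimizeDistincts - complete match with distinct aggregates (and no
+      //                       skewed group by): mappers emit partial aggregates
+      //   setBucketGroup    - a partial (or complete) match only marks the
+      //                       map-side hash group by as a bucket group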
+      boolean removeReduceSink = false;
+      boolean optimizeDistincts = false;
+      boolean setBucketGroup = false;
+
       // Dont remove the operator for distincts
-      if (useMapperSort && !groupByOp.getConf().isDistinct() &&
+      if (useMapperSort &&
           (match == GroupByOptimizerSortMatch.COMPLETE_MATCH)) {
-        convertGroupByMapSideSortedGroupBy(hiveConf, groupByOp, depth);
+        if (!groupByOpDesc.isDistinct()) {
+          removeReduceSink = true;
+        }
+        else if (!HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEGROUPBYSKEW)) {
+          // Optimize the query: select count(distinct keys) from T, where
+          // T is bucketized and sorted on the distinct keys.
+          // Partial aggregation can be done by the mappers in this scenario
+
+          List<ExprNodeDesc> keys =
+              ((GroupByOperator)
+                  (groupByOp.getChildOperators().get(0).getChildOperators().get(0)))
+                  .getConf().getKeys();
+          if ((keys == null) || (keys.isEmpty())) {
+            optimizeDistincts = true;
+          }
+        }
       }
-      else if ((match == GroupByOptimizerSortMatch.PARTIAL_MATCH) ||
+
+      if ((match == GroupByOptimizerSortMatch.PARTIAL_MATCH) ||
           (match == GroupByOptimizerSortMatch.COMPLETE_MATCH)) {
-        groupByOp.getConf().setBucketGroup(true);
+        setBucketGroup = true;
       }
+
+      if (removeReduceSink) {
+        convertGroupByMapSideSortedGroupBy(hiveConf, groupByOp, depth);
+      }
+      else if (optimizeDistincts) {
+        ReduceSinkOperator reduceSinkOp =
+            (ReduceSinkOperator)groupByOp.getChildOperators().get(0);
+        GroupByDesc childGroupByDesc =
+            ((GroupByOperator)
+                (reduceSinkOp.getChildOperators().get(0))).getConf();
+
+        for (int pos = 0; pos < childGroupByDesc.getAggregators().size(); pos++) {
+          AggregationDesc aggr = childGroupByDesc.getAggregators().get(pos);
+          // Partial aggregation is not normally done for distincts on the mapper.
+          // However, if the data is bucketed/sorted on the distinct key, partial
+          // aggregation can be performed on the mapper.
+          if (aggr.getDistinct()) {
+            ArrayList<ExprNodeDesc> parameters = new ArrayList<ExprNodeDesc>();
+            ExprNodeDesc param = aggr.getParameters().get(0);
+            assert param instanceof ExprNodeColumnDesc;
+            ExprNodeColumnDesc paramC = (ExprNodeColumnDesc) param;
+            paramC.setIsPartitionColOrVirtualCol(false);
+            paramC.setColumn("VALUE._col" + pos);
+            parameters.add(paramC);
+            aggr.setParameters(parameters);
+            aggr.setDistinct(false);
+            aggr.setMode(Mode.FINAL);
+          }
+        }
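+        // For example, for 'select count(distinct key) from T1' the reducer-side
+        // aggregator count(DISTINCT KEY._col0:0._col0) has now been rewritten to
+        // count(VALUE._col0) running in mode FINAL: it only merges the partial
+        // counts already computed by the mappers.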
+        // Partial aggregation is performed on the mapper, so no distinct
+        // processing is needed at the reducer.
+        childGroupByDesc.setDistinct(false);
+        groupByOpDesc.setDontResetAggrsDistinct(true);
+        groupByOpDesc.setBucketGroup(true);
+        groupByOp.setUseBucketizedHiveInputFormat(true);
+        // A query like 'select count(distinct key) from T' is transformed into
+        // 'select count(key) from T' as far as the reducer is concerned.
+        reduceSinkOp.getConf().setDistinctColumnIndices(new ArrayList<List<Integer>>());
+      }
+      else if (setBucketGroup) {
+        groupByOpDesc.setBucketGroup(true);
+      }
     }
 
     @Override
@@ -339,8 +404,8 @@
 
       GroupByOptimizerSortMatch currentMatch =
           notDeniedPartns.isEmpty() ? GroupByOptimizerSortMatch.NO_MATCH :
-          notDeniedPartns.size() > 1 ? GroupByOptimizerSortMatch.PARTIAL_MATCH :
-          GroupByOptimizerSortMatch.COMPLETE_MATCH;
+              notDeniedPartns.size() > 1 ? GroupByOptimizerSortMatch.PARTIAL_MATCH :
+                  GroupByOptimizerSortMatch.COMPLETE_MATCH;
       for (Partition part : notDeniedPartns) {
         List<String> sortCols = part.getSortColNames();
         List<String> bucketCols = part.getBucketCols();
@@ -440,8 +505,9 @@
       case NO_MATCH:
         return GroupByOptimizerSortMatch.NO_MATCH;
       case COMPLETE_MATCH:
-        return ((bucketCols != null) && !bucketCols.isEmpty() && sortCols.containsAll(bucketCols)) ?
-            GroupByOptimizerSortMatch.COMPLETE_MATCH : GroupByOptimizerSortMatch.PARTIAL_MATCH;
+        return ((bucketCols != null) && !bucketCols.isEmpty() &&
+            sortCols.containsAll(bucketCols)) ?
+            GroupByOptimizerSortMatch.COMPLETE_MATCH : GroupByOptimizerSortMatch.PARTIAL_MATCH;
       case PREFIX_COL1_MATCH:
         return GroupByOptimizerSortMatch.NO_MATCH;
       case PREFIX_COL2_MATCH:
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (revision 1465924)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/GroupByOperator.java (working copy)
@@ -58,12 +58,12 @@
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
 import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.StructField;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.UnionObject;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
-import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
@@ -90,9 +90,7 @@
   protected transient ObjectInspector[][] aggregationParameterObjectInspectors;
   protected transient ObjectInspector[][] aggregationParameterStandardObjectInspectors;
   protected transient Object[][] aggregationParameterObjects;
-  // In the future, we may allow both count(DISTINCT a) and sum(DISTINCT a) in
-  // the same SQL clause,
-  // so aggregationIsDistinct is a boolean array instead of a single number.
+  // aggregationIsDistinct is a boolean array rather than a single flag
   protected transient boolean[] aggregationIsDistinct;
   // Map from integer tag to distinct aggrs
@@ -887,8 +885,15 @@
 
     // Forward the current keys if needed for sort-based aggregation
     if (currentKeys != null && !keysAreEqual) {
-      forward(currentKeys.getKeyArray(), aggregations);
-      countAfterReport = 0;
+      // This is to optimize queries of the form:
+      //   select count(distinct key) from T
+      // where T is sorted and bucketized by key.
+      // Partial aggregation is performed on the mapper, and the
+      // reducer gets 1 row (partial result) per mapper.
+      if (!conf.isDontResetAggrsDistinct()) {
+        forward(currentKeys.getKeyArray(), aggregations);
+        countAfterReport = 0;
+      }
     }
 
     // Need to update the keys?
@@ -900,7 +905,10 @@
     }
 
     // Reset the aggregations
-    resetAggregations(aggregations);
+    // For distincts optimization with sorting/bucketing, perform partial aggregation
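+    // instead: with dontResetAggrsDistinct set, the keys above are not forwarded
+    // and the aggregation buffers are not reset, so the partial result keeps
+    // accumulating and each mapper emits a single row for the reducer to merge.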
+    if (!conf.isDontResetAggrsDistinct()) {
+      resetAggregations(aggregations);
+    }
 
     // clear parameters in last-invoke
     for (int i = 0; i < aggregationsParametersLastInvoke.length; i++) {
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java (revision 1465924)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java (working copy)
@@ -35,13 +35,13 @@
   * PARTIAL1: partial aggregation - first phase: iterate, terminatePartial
   * PARTIAL2: partial aggregation - second phase: merge, terminatePartial
   * PARTIALS: For non-distinct the same as PARTIAL2, for distinct the same as
-  * PARTIAL1
+  *           PARTIAL1
   * FINAL: partial aggregation - final phase: merge, terminate
   * HASH: For non-distinct the same as PARTIAL1 but use hash-table-based aggregation
   * MERGEPARTIAL: FINAL for non-distinct aggregations, COMPLETE for distinct
   * aggregations.
   */
  private static final long serialVersionUID = 1L;
 
  /**
   * Mode.
   */
@@ -66,6 +66,7 @@
   private float groupByMemoryUsage;
   private float memoryThreshold;
   transient private boolean isDistinct;
+  private boolean dontResetAggrsDistinct;
 
   public GroupByDesc() {
   }
@@ -83,8 +84,8 @@
       final int groupingSetsPosition,
       final boolean isDistinct) {
     this(mode, outputColumnNames, keys, aggregators, groupKeyNotReductionKey,
-      false, groupByMemoryUsage, memoryThreshold, listGroupingSets,
-      groupingSetsPresent, groupingSetsPosition, isDistinct);
+        false, groupByMemoryUsage, memoryThreshold, listGroupingSets,
+        groupingSetsPresent, groupingSetsPosition, isDistinct);
   }
 
   public GroupByDesc(
@@ -212,11 +213,11 @@
    */
   public boolean isDistinctLike() {
     ArrayList<AggregationDesc> aggregators = getAggregators();
-    for(AggregationDesc ad: aggregators){
-      if(!ad.getDistinct()) {
+    for (AggregationDesc ad : aggregators) {
+      if (!ad.getDistinct()) {
         GenericUDAFEvaluator udafEval = ad.getGenericUDAFEvaluator();
         UDFType annot = udafEval.getClass().getAnnotation(UDFType.class);
-        if(annot == null || !annot.distinctLike()) {
+        if (annot == null || !annot.distinctLike()) {
           return false;
         }
       }
@@ -257,4 +258,16 @@
   public boolean isDistinct() {
     return isDistinct;
   }
+
+  public void setDistinct(boolean isDistinct) {
+    this.isDistinct = isDistinct;
+  }
+
+  public boolean isDontResetAggrsDistinct() {
+    return dontResetAggrsDistinct;
+  }
+
+  public void setDontResetAggrsDistinct(boolean dontResetAggrsDistinct) {
+    this.dontResetAggrsDistinct = dontResetAggrsDistinct;
+  }
 }