Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1379313)
+++ conf/hive-default.xml.template (working copy)
@@ -414,6 +414,12 @@
+<property>
+  <name>hive.optimize.skewjoinoptimization</name>
+  <value>false</value>
+  <description>Whether to modify the operator tree to optimize skewed joins</description>
+</property>
+
 <property>
   <name>hive.multigroupby.singlemr</name>
   <value>false</value>
   <description>Whether to optimize multi group by query to generate single M/R
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1379313)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -489,6 +489,7 @@
     HIVEOPTBUCKETMAPJOIN("hive.optimize.bucketmapjoin", false), // optimize bucket map join
     HIVEOPTSORTMERGEBUCKETMAPJOIN("hive.optimize.bucketmapjoin.sortedmerge", false), // try to use sorted merge bucket map join
     HIVEOPTREDUCEDEDUPLICATION("hive.optimize.reducededuplication", true),
+    HIVEOPTSKEWJOIN("hive.optimize.skewjoinoptimization", false), // optimize skewed join

     // Indexes
     HIVEOPTINDEXFILTER_COMPACT_MINSIZE("hive.optimize.index.filter.compact.minsize", (long) 5 * 1024 * 1024 * 1024), // 5G
Index: ql/src/test/results/clientpositive/skewjoinopt8.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt8.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt8.q.out (working copy)
@@ -0,0 +1,293 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T3
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t3
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t3
+PREHOOK: query: EXPLAIN
+SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+  (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME T3) c) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL c) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME c)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-5 + Stage-5 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not ((key = '3') or (key = '8'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not ((key = '3') or (key = '8'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:c + TableScan + alias: c + Filter Operator + predicate: + expr: (not ((key = '3') or (key = '8'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 2 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + 2 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + expr: _col8 + type: string + expr: _col9 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key = '3') or (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: ((key 
= '3') or (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + c + TableScan + alias: c + Filter Operator + predicate: + expr: ((key = '3') or (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 2 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + 2 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + expr: _col8 + type: string + expr: _col9 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +#### A masked pattern was here #### +2 12 2 22 2 12 Index: ql/src/test/results/clientpositive/skewjoinopt3.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt3.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt3.q.out (working copy) @@ -0,0 +1,448 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key 
+POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not (((key = '2') or (key = '8')) or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not (((key = '2') or (key = '8')) or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: (((key = '2') or (key = '8')) or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: (((key = '2') or (key = '8')) or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} 
{VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 12 2 22 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +PREHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_FULLOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not (((key = '2') or (key = '8')) or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not (((key = '2') or (key = '8')) or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Outer Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: (((key = '2') or (key = '8')) or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: (((key = '2') or (key = '8')) or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Outer Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +1 11 NULL NULL +NULL NULL 4 14 +NULL NULL 5 15 +7 17 NULL NULL +2 12 2 22 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 Index: ql/src/test/results/clientpositive/skewjoinopt10.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt10.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt10.q.out (working copy) @@ -0,0 +1,303 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, value STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, value STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: drop table array_valued_T1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table array_valued_T1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table array_valued_T1 (key string, value array<string>) SKEWED BY (key) ON ((8)) +PREHOOK: type: CREATETABLE +POSTHOOK: query: create table array_valued_T1 (key string, value 
array<string>) SKEWED BY (key) ON ((8)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@array_valued_T1 +PREHOOK: query: insert overwrite table array_valued_T1 select key, array(value) from T1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Output: default@array_valued_t1 +POSTHOOK: query: insert overwrite table array_valued_T1 select key, array(value) from T1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Output: default@array_valued_t1 +POSTHOOK: Lineage: array_valued_t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: array_valued_t1.value EXPRESSION [(t1)t1.FieldSchema(name:value, type:string, comment:null), ] +PREHOOK: query: explain +select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val +PREHOOK: type: QUERY +POSTHOOK: query: explain +select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val +POSTHOOK: type: QUERY +POSTHOOK: Lineage: array_valued_t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: array_valued_t1.value EXPRESSION [(t1)t1.FieldSchema(name:value, type:string, comment:null), ] +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LATERAL_VIEW (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION explode (TOK_TABLE_OR_COL array_val)) val (TOK_TABALIAS c))) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME array_valued_T1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) value) array_val)))) i))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-3 + Stage-3 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: value + type: array<string> + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: array<string> + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Lateral View Forward + Select Operator + SELECT * : (no compute) + Lateral View Join Operator + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: array<string> + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + Select Operator + expressions: + expr: _col1 + type: array<string> + outputColumnNames: _col0 + UDTF Operator + function name: explode + Lateral View Join Operator + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: array<string> + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Lateral View Forward + Select Operator + SELECT * : (no compute) + Lateral View Join Operator + outputColumnNames: _col0, _col1, _col2 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: array<string> + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + Select Operator + expressions: + expr: _col1 + type: array<string> + outputColumnNames: _col0 + UDTF Operator + function name: explode + Lateral View Join Operator + outputColumnNames: _col0, _col1, _col2 + 
Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: array<string> + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: + i:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key = '8') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + i:b + TableScan + alias: b + Filter Operator + predicate: + expr: (key = '8') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: value + type: array<string> + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col5 + type: array<string> + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val +PREHOOK: type: QUERY +PREHOOK: Input: default@array_valued_t1 +PREHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: query: select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@array_valued_t1 +POSTHOOK: Input: default@t1 +#### A masked pattern was here #### +POSTHOOK: Lineage: array_valued_t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ] +POSTHOOK: Lineage: array_valued_t1.value EXPRESSION [(t1)t1.FieldSchema(name:value, type:string, comment:null), ] +1 ["11"] 11 +2 ["12"] 12 +3 ["13"] 13 +7 ["17"] 17 +8 ["18"] 18 +8 ["28"] 28 +8 ["18"] 18 +8 ["28"] 28 Index: ql/src/test/results/clientpositive/skewjoinopt5.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt5.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt5.q.out (working copy) @@ -0,0 +1,233 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key 
STRING, val STRING) +SKEWED BY (key) ON ((3)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not ((key = '2') or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not ((key = '2') or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key = '2') or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: 
string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: ((key = '2') or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +2 12 2 22 +3 13 3 13 Index: ql/src/test/results/clientpositive/skewjoinopt12.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt12.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt12.q.out (working copy) @@ -0,0 +1,279 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: -- skew join in a sub-query followed by group by +EXPLAIN +select key, count(1) from +( + select a.key as key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) s group by key +PREHOOK: type: QUERY +POSTHOOK: query: -- skew join in a sub-query followed by group by +EXPLAIN +select key, count(1) from +( + select a.key as key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) s group by key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) val) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) val) val2)))) s)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + s:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key = '2') + 
type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + s:b + TableScan + alias: b + Filter Operator + predicate: + expr: (key = '2') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select key, count(1) from +( + select a.key as key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) s group by key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select key, count(1) from +( + select a.key as key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) s group by key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 1 +3 1 +8 4 Index: ql/src/test/results/clientpositive/skewjoinopt7.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt7.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt7.q.out (working copy) @@ -0,0 +1,295 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T3 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3 +PREHOOK: type: LOAD +PREHOOK: Output: default@t3 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t3 +PREHOOK: query: EXPLAIN +SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on 
a.key = c.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME T3) c) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL c) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME c)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-5 + Stage-5 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not (((key = '2') or (key = '8')) or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not (((key = '2') or (key = '8')) or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:c + TableScan + alias: c + Filter Operator + predicate: + expr: (not (((key = '2') or (key = '8')) or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 2 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + 2 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + expr: _col8 + type: string + expr: _col9 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: 
a + Filter Operator + predicate: + expr: (((key = '2') or (key = '8')) or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: (((key = '2') or (key = '8')) or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + c + TableScan + alias: c + Filter Operator + predicate: + expr: (((key = '2') or (key = '8')) or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 2 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + Inner Join 0 to 2 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + 2 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + expr: _col8 + type: string + expr: _col9 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +#### A masked pattern was here #### +2 12 2 22 2 12 Index: ql/src/test/results/clientpositive/skewjoinopt2.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt2.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt2.q.out (working copy) @@ -0,0 +1,914 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (7)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (7)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + 
type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 12 2 22 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +PREHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, 
_col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +1 11 NULL NULL +2 12 2 22 +3 13 3 13 +7 17 NULL NULL +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +PREHOOK: query: -- a group by at the end should not change anything + +EXPLAIN +SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key group by a.key +PREHOOK: type: QUERY +POSTHOOK: query: -- a group by at the end should not change anything + +EXPLAIN +SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key group by a.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL a) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: 
string + tag: -1 + value expressions: + expr: _col1 + type: bigint +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key group by a.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key group by a.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 1 +3 1 +8 4 +PREHOOK: query: EXPLAIN +SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key group by a.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key group by a.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. 
(TOK_TABLE_OR_COL a) key)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: _col0 + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')) + type: boolean + Reduce Output Operator + key expressions: + expr: key 
+ type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} + 1 + handleSkewJoin: false + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: string + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key group by a.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key group by a.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +1 1 +2 1 +3 1 +7 1 +8 4 Index: ql/src/test/results/clientpositive/skewjoinopt14.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt14.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt14.q.out (working copy) @@ -0,0 +1,304 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T3 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3 +PREHOOK: type: LOAD +PREHOOK: Output: default@t3 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t3 +PREHOOK: query: -- skew join in a sub-query followed by group by +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +PREHOOK: type: QUERY +POSTHOOK: query: -- skew join in a sub-query followed by group by +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME T3) c) (= (. (TOK_TABLE_OR_COL a) val) (. (TOK_TABLE_OR_COL c) val)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + c + TableScan + alias: c + Reduce Output Operator + key expressions: + expr: val + type: string + sort order: + + Map-reduce partition columns: + expr: val + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Reduce Output Operator + key expressions: + expr: _col1 + type: string + sort order: + + Map-reduce partition columns: + expr: _col1 + type: string + tag: 0 + value expressions: + expr: _col4 + type: string + expr: _col5 + type: string + expr: _col0 + type: string + expr: _col1 + type: string +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Reduce Output Operator + key expressions: + expr: _col1 + type: string + sort order: + + Map-reduce partition columns: + expr: _col1 + type: string + tag: 0 + value expressions: + expr: _col4 + type: string + expr: _col5 + type: string + expr: _col0 + type: string + expr: _col1 + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col4} {VALUE._col5} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9 + Select Operator + expressions: + expr: _col4 + type: string + expr: _col5 + type: string + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col8 + type: string + expr: _col9 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + 
predicate: + expr: (key = '2') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: (key = '2') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +#### A masked pattern was here #### +POSTHOOK: query: select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +#### A masked pattern was here #### +2 12 2 22 2 12 Index: ql/src/test/results/clientpositive/skewjoinopt9.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt9.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt9.q.out (working copy) @@ -0,0 +1,358 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: -- no skew optimization in case of a union in the sub-query +EXPLAIN +select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- no skew optimization in case of a union in the sub-query +EXPLAIN +select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION 
(TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)))))) subq1) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + b + TableScan + alias: b + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + null-subquery1:subq1-subquery1:t1 + TableScan + alias: t1 + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: _col0, _col1 + Union + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: string + null-subquery2:subq1-subquery2:t1 + TableScan + alias: t1 + Select Operator + expressions: + expr: key + type: string + expr: val + type: string + outputColumnNames: _col0, _col1 + Union + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: string + expr: _col3 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 12 2 22 +2 12 2 22 +3 13 3 13 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +8 28 8 18 +8 28 8 18 +PREHOOK: query: -- no skew optimization in case of a group by in the sub-query +EXPLAIN +select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- no skew optimization in case of a group by in the sub-query +EXPLAIN +select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key +POSTHOOK: type: QUERY 
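[Editorial aside, hand-written for this review; it is not part of the generated skewjoinopt9.q.out content. The two negative cases in this file read best against the rewrite that the compile-time skew join optimization performs when it does fire. With T1 declared SKEWED BY (key) ON ((2)) as above, a plain join is compiled into the logical equivalent of the union sketched below; the (not (key = '2')) / (key = '2') filter pair and the trailing union stage correspond to the Stage-1 / Stage-4 / Stage-2 pattern visible in the split plans elsewhere in this patch. The column aliases and the subquery name here are illustrative only, not names produced by the optimizer.

    -- Sketch of the logical rewrite, assuming T1 SKEWED BY (key) ON ((2)):
    SELECT * FROM (
      SELECT a.key, a.val, b.key AS bkey, b.val AS bval
      FROM T1 a JOIN T2 b ON a.key = b.key
      WHERE NOT (a.key = '2')     -- non-skewed rows: the Stage-1 join
                                  -- (the compiled plan pushes this filter to both join inputs)
      UNION ALL
      SELECT a.key, a.val, b.key AS bkey, b.val AS bval
      FROM T1 a JOIN T2 b ON a.key = b.key
      WHERE a.key = '2'           -- skewed rows: the separate Stage-4 join
    ) skew_split;                 -- the enclosing union is the extra union stage

When the join input is itself a UNION ALL or a GROUP BY sub-query, as in the two queries in this file, no such split is attempted: the plans that follow keep a single Join Operator with no skewed-key filters, which is exactly the behavior these two test cases pin down.]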
+ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq1) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subq1:t1 + TableScan + alias: t1 + Select Operator + expressions: + expr: key + type: string + outputColumnNames: key + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + keys: + expr: key + type: string + mode: hash + outputColumnNames: _col0, _col1 + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: -1 + value expressions: + expr: _col1 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + keys: + expr: KEY._col0 + type: string + mode: mergepartial + outputColumnNames: _col0, _col1 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + outputColumnNames: _col0, _col1 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col0 + type: string + sort order: + + Map-reduce partition columns: + expr: _col0 + type: string + tag: 0 + value expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + b + TableScan + alias: b + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col2, _col3 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: bigint + expr: _col2 + type: string + expr: _col3 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 1 2 22 +3 1 3 13 +8 2 8 18 +8 2 8 18 Index: 
ql/src/test/results/clientpositive/skewjoinopt4.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt4.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt4.q.out (working copy) @@ -0,0 +1,440 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select 
Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: (key = '2') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: (key = '2') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +2 12 2 22 +PREHOOK: query: -- the order of the join should not matter, just confirming +EXPLAIN +SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- the order of the join should not matter, just confirming +EXPLAIN +SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T2) a) (TOK_TABREF (TOK_TABNAME T1) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: (key = '2') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: (key = '2') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 
+ File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +3 13 3 13 +8 18 8 18 +8 18 8 28 +8 18 8 18 +8 18 8 28 +2 22 2 12 Index: ql/src/test/results/clientpositive/skewjoinopt11.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt11.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt11.q.out (working copy) @@ -0,0 +1,457 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: -- skew join followed by union +EXPLAIN +select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1 +PREHOOK: type: QUERY +POSTHOOK: query: -- skew join followed by union +EXPLAIN +select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1 +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) val) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) val) val2)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) val) val1) (TOK_SELEXPR (. 
(TOK_TABLE_OR_COL b) val) val2))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-5 + Stage-3 depends on stages: Stage-2, Stage-8 + Stage-5 is a root stage + Stage-7 is a root stage + Stage-8 depends on stages: Stage-7, Stage-9 + Stage-9 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-3 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col2 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-5 + Map Reduce + Alias -> Map Operator Tree: + null-subquery2:subq1-subquery2:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key = '2') + type: boolean + Reduce Output Operator + 
key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + null-subquery2:subq1-subquery2:b + TableScan + alias: b + Filter Operator + predicate: + expr: (key = '2') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-7 + Map Reduce + Alias -> Map Operator Tree: + null-subquery1:subq1-subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (key = '2') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + null-subquery1:subq1-subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (key = '2') + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-8 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-9 + Map Reduce + Alias -> Map Operator Tree: + subquery2:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + 
subquery2:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not (key = '2')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1 +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +3 13 13 +8 18 18 +8 18 18 +8 28 18 +8 28 18 +2 12 22 +2 12 22 +3 13 13 +8 18 18 +8 18 18 +8 28 18 +8 28 18 Index: ql/src/test/results/clientpositive/skewjoinopt6.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt6.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt6.q.out (working copy) @@ -0,0 +1,233 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not (((key = '2') or (key = '8')) or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not (((key = '2') or (key = '8')) or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: (((key = '2') or (key = '8')) or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: (((key = '2') or (key = '8')) or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: 
string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +2 12 2 22 +3 13 3 13 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 Index: ql/src/test/results/clientpositive/skewjoinopt1.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt1.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt1.q.out (working copy) @@ -0,0 +1,830 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not ((key = '2') or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not ((key = '2') or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key = '2') or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: ((key = '2') or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: 
_col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +2 12 2 22 +3 13 3 13 +PREHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b)))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not ((key = '2') or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not ((key = '2') or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: 
org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key = '2') or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Filter Operator + predicate: + expr: ((key = '2') or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + Select Operator + expressions: + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col4 + type: string + expr: _col5 + type: string + outputColumnNames: _col0, _col1, _col2, _col3 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +NULL NULL 4 14 +NULL NULL 5 15 +8 18 8 18 +8 18 8 18 +8 28 8 18 +8 28 8 18 +2 12 2 22 +3 13 3 13 +PREHOOK: query: -- a group by at the end should not change anything + +EXPLAIN +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: -- a group by at the end should not change anything + +EXPLAIN +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. 
(TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not ((key = '2') or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not ((key = '2') or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + Select Operator + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key = '2') or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + b + TableScan + alias: b + Filter Operator + predicate: + expr: ((key = '2') or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + Select Operator + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 
+PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +6 +PREHOOK: query: EXPLAIN +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1))))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1, Stage-4 + Stage-4 is a root stage + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + subquery1:a + TableScan + alias: a + Filter Operator + predicate: + expr: (not ((key = '2') or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + subquery1:b + TableScan + alias: b + Filter Operator + predicate: + expr: (not ((key = '2') or (key = '3'))) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + Select Operator + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint +#### A masked pattern was here #### + TableScan + Union + Select Operator + SELECT * : (no compute) + Group By Operator + aggregations: + expr: count(1) + bucketGroup: false + mode: hash + outputColumnNames: _col0 + Reduce Output Operator + sort order: + tag: -1 + value expressions: + expr: _col0 + type: bigint + Reduce Operator Tree: + Group By Operator + aggregations: + expr: count(VALUE._col0) + bucketGroup: false + mode: mergepartial + outputColumnNames: _col0 + Select Operator + expressions: + expr: _col0 + type: bigint + outputColumnNames: _col0 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-4 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Filter Operator + predicate: + expr: ((key = '2') or (key = '3')) + type: boolean + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + b + TableScan + alias: b + Filter Operator + predicate: + expr: ((key = '2') or (key = '3')) + type: boolean + Reduce Output Operator + key 
expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + Reduce Operator Tree: + Join Operator + condition map: + Right Outer Join0 to 1 + condition expressions: + 0 + 1 + handleSkewJoin: false + Select Operator + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +#### A masked pattern was here #### +POSTHOOK: query: SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +#### A masked pattern was here #### +8 Index: ql/src/test/results/clientpositive/skewjoinopt13.q.out =================================================================== --- ql/src/test/results/clientpositive/skewjoinopt13.q.out (revision 0) +++ ql/src/test/results/clientpositive/skewjoinopt13.q.out (working copy) @@ -0,0 +1,205 @@ +PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +PREHOOK: type: LOAD +PREHOOK: Output: default@t1 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t1 +PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T2 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +PREHOOK: type: LOAD +PREHOOK: Output: default@t2 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t2 +PREHOOK: query: CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@T3 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3 +PREHOOK: type: LOAD +PREHOOK: Output: default@t3 +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@t3 +PREHOOK: query: -- skew join in a sub-query followed by group by +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +PREHOOK: type: QUERY +POSTHOOK: query: -- skew join in a sub-query followed by group by +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +POSTHOOK: type: QUERY +ABSTRACT SYNTAX TREE: + (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME T3) c) (= (. (TOK_TABLE_OR_COL a) val) (. 
(TOK_TABLE_OR_COL c) val)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF)))) + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Alias -> Map Operator Tree: + a + TableScan + alias: a + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 0 + value expressions: + expr: key + type: string + expr: val + type: string + b + TableScan + alias: b + Reduce Output Operator + key expressions: + expr: key + type: string + sort order: + + Map-reduce partition columns: + expr: key + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + + Stage: Stage-2 + Map Reduce + Alias -> Map Operator Tree: + $INTNAME + Reduce Output Operator + key expressions: + expr: _col1 + type: string + sort order: + + Map-reduce partition columns: + expr: _col1 + type: string + tag: 0 + value expressions: + expr: _col4 + type: string + expr: _col5 + type: string + expr: _col0 + type: string + expr: _col1 + type: string + c + TableScan + alias: c + Reduce Output Operator + key expressions: + expr: val + type: string + sort order: + + Map-reduce partition columns: + expr: val + type: string + tag: 1 + value expressions: + expr: key + type: string + expr: val + type: string + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + condition expressions: + 0 {VALUE._col0} {VALUE._col1} {VALUE._col4} {VALUE._col5} + 1 {VALUE._col0} {VALUE._col1} + handleSkewJoin: false + outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9 + Select Operator + expressions: + expr: _col4 + type: string + expr: _col5 + type: string + expr: _col0 + type: string + expr: _col1 + type: string + expr: _col8 + type: string + expr: _col9 + type: string + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + File Output Operator + compressed: false + GlobalTableId: 0 + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + + Stage: Stage-0 + Fetch Operator + limit: -1 + + +PREHOOK: query: select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +PREHOOK: type: QUERY +PREHOOK: Input: default@t1 +PREHOOK: Input: default@t2 +PREHOOK: Input: default@t3 +#### A masked pattern was here #### +POSTHOOK: query: select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t1 +POSTHOOK: Input: default@t2 +POSTHOOK: Input: default@t3 +#### A masked pattern was here #### +2 12 2 22 2 12 Index: ql/src/test/queries/clientpositive/skewjoinopt1.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt1.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt1.q (working copy) @@ -0,0 +1,36 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set 
hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +-- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key; + +-- a group by at the end should not change anything + +EXPLAIN +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key; + +EXPLAIN +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key; + +SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/skewjoinopt3.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt3.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt3.q (working copy) @@ -0,0 +1,25 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + + +-- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/skewjoinopt5.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt5.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt5.q (working copy) @@ -0,0 +1,17 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/skewjoinopt7.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt7.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt7.q (working copy) @@ -0,0 +1,21 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL 
INPATH '../data/files/T3.txt' INTO TABLE T3; + +EXPLAIN +SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key; + +SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key; Index: ql/src/test/queries/clientpositive/skewjoinopt10.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt10.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt10.q (working copy) @@ -0,0 +1,16 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, value STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +drop table array_valued_T1; +create table array_valued_T1 (key string, value array<string>) SKEWED BY (key) ON ((8)); +insert overwrite table array_valued_T1 select key, array(value) from T1; + +explain +select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val; + +select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val; + Index: ql/src/test/queries/clientpositive/skewjoinopt9.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt9.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt9.q (working copy) @@ -0,0 +1,44 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +-- no skew optimization in case of a union in the sub-query +EXPLAIN +select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key; + +select * from +( +select key, val from T1 + union all +select key, val from T1 +) subq1 +join T2 b on subq1.key = b.key; + + +-- no skew optimization in case of a group by in the sub-query +EXPLAIN +select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key; + +select * from +( +select key, count(1) as cnt from T1 group by key +) subq1 +join T2 b on subq1.key = b.key; Index: ql/src/test/queries/clientpositive/skewjoinopt12.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt12.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt12.q (working copy) @@ -0,0 +1,23 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +-- skew join in a sub-query followed by group by +EXPLAIN +select key, count(1) from +( + select a.key as key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) s group by key; + +select key, count(1) from +( + select a.key as key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) s group by key; Index: 
ql/src/test/queries/clientpositive/skewjoinopt14.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt14.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt14.q (working copy) @@ -0,0 +1,29 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3; + +-- skew join in a sub-query followed by group by +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val; + +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val; + Index: ql/src/test/queries/clientpositive/skewjoinopt2.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt2.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt2.q (working copy) @@ -0,0 +1,36 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2), (7)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +-- test outer joins also + +EXPLAIN +SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key; + +-- a group by at the end should not change anything + +EXPLAIN +SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key group by a.key; + +SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key group by a.key; + +EXPLAIN +SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key group by a.key; + +SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key group by a.key; Index: ql/src/test/queries/clientpositive/skewjoinopt4.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt4.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt4.q (working copy) @@ -0,0 +1,22 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +-- the order of the join should not matter, just confirming +EXPLAIN +SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key; + +SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/skewjoinopt6.q =================================================================== --- 
ql/src/test/queries/clientpositive/skewjoinopt6.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt6.q (working copy) @@ -0,0 +1,17 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +EXPLAIN +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; + +SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key; Index: ql/src/test/queries/clientpositive/skewjoinopt8.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt8.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt8.q (working copy) @@ -0,0 +1,20 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) +SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3; + +EXPLAIN +SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key; + +SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key; Index: ql/src/test/queries/clientpositive/skewjoinopt11.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt11.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt11.q (working copy) @@ -0,0 +1,27 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) +SKEWED BY (key) ON ((2)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +-- skew join followed by union +EXPLAIN +select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1; + +select * from +( + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key + union all + select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key +) subq1; Index: ql/src/test/queries/clientpositive/skewjoinopt13.q =================================================================== --- ql/src/test/queries/clientpositive/skewjoinopt13.q (revision 0) +++ ql/src/test/queries/clientpositive/skewjoinopt13.q (working copy) @@ -0,0 +1,28 @@ +set hive.internal.ddl.list.bucketing.enable=true; +set hive.optimize.skewjoinoptimization = true; + +CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1; + +CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2; + +CREATE TABLE T3(key STRING, val STRING) +SKEWED BY (val) ON ((12)) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH 
'../data/files/T3.txt' INTO TABLE T3; + +-- skew join in a sub-query followed by group by +EXPLAIN +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val; + +select * +from +T1 a join T2 b on a.key = b.key +join T3 c on a.val = c.val; + Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SkewJoinOptimizer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SkewJoinOptimizer.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SkewJoinOptimizer.java (working copy) @@ -0,0 +1,686 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorFactory; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.RowResolver; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.FilterDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.plan.UnionDesc; +import 
org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNot;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+
+/**
+ * SkewJoinOptimizer.
+ */
+public class SkewJoinOptimizer implements Transform {
+
+  private static final Log LOG = LogFactory.getLog(SkewJoinOptimizer.class.getName());
+  private static ParseContext parseContext;
+
+  public static class SkewJoinProc implements NodeProcessor {
+    public SkewJoinProc() {
+      super();
+    }
+
+    @Override
+    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+      Object... nodeOutputs) throws SemanticException {
+      // We expect a tree that looks like this:
+      //   TS -> (FIL|SEL)* -> RS -
+      //                           \
+      //                            -> JOIN -> SEL -> FS
+      //                           /
+      //   TS -> (FIL|SEL)* -> RS -
+      //
+      // The walker dispatches this processor on the join operator.
+
+      SkewJoinOptProcCtx ctx = (SkewJoinOptProcCtx) procCtx;
+      parseContext = ctx.getpGraphContext();
+
+      JoinOperator joinOp = (JoinOperator) nd;
+      // This join has already been processed
+      if (ctx.getDoneJoins().contains(joinOp)) {
+        return null;
+      }
+
+      ctx.getDoneJoins().add(joinOp);
+
+      Operator<? extends OperatorDesc> currOp = joinOp;
+      boolean processSelect = false;
+
+      // Is there a select following the join?
+      // Clone the select also. It is useful for a follow-on optimization where the union
+      // followed by a select star is completely removed.
+      if ((joinOp.getChildOperators().size() == 1) &&
+          (joinOp.getChildOperators().get(0) instanceof SelectOperator)) {
+        currOp = joinOp.getChildOperators().get(0);
+        processSelect = true;
+      }
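+
+      // Illustration (matches the skewjoinopt1.q golden output above): with T1
+      // SKEWED BY (key) ON ((2)) and T2 SKEWED BY (key) ON ((3)), the query
+      //   SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+      // is effectively rewritten to the union of two joins, roughly:
+      //   SELECT * FROM T1 a JOIN T2 b ON a.key = b.key
+      //     WHERE (key = '2') OR (key = '3')        -- skewed keys only
+      //   UNION ALL
+      //   SELECT * FROM T1 a JOIN T2 b ON a.key = b.key
+      //     WHERE NOT ((key = '2') OR (key = '3'))  -- remaining keys
+      // (A sketch only: the rewrite happens on the operator tree, with the
+      // filters pushed onto each table scan, not on the SQL text.)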
+      // Currently, we only support FILTERs and SELECTs on the paths from the
+      // table scans to the join for the skew join optimization. This can be
+      // relaxed easily in the future.
+      List<TableScanOperator> tableScanOpsForJoin = new ArrayList<TableScanOperator>();
+      if (!getTableScanOpsForJoin(joinOp, tableScanOpsForJoin)) {
+        return null;
+      }
+
+      if ((tableScanOpsForJoin == null) || (tableScanOpsForJoin.isEmpty())) {
+        return null;
+      }
+
+      // Get the skewed values in all the tables
+      Map<List<ExprNodeDesc>, List<List<String>>> skewedValues =
+        getSkewedValues(joinOp, tableScanOpsForJoin);
+
+      // If there are no skewed values, nothing needs to be done
+      if (skewedValues == null || skewedValues.size() == 0) {
+        return null;
+      }
+
+      // After this optimization, the tree should look like:
+      //
+      //  TS -> (FIL "skewed rows") (FIL|SEL)* -> RS -
+      //                                              \
+      //                                               -> JOIN
+      //                                              /       \
+      //  TS -> (FIL "skewed rows") (FIL|SEL)* -> RS -         \
+      //                                                        -> UNION -> SEL -> FS
+      //                                                       /
+      //  TS -> (FIL "no skewed rows") (FIL|SEL)* -> RS -     /
+      //                                                 \   /
+      //                                                  -> JOIN
+      //                                                 /
+      //  TS -> (FIL "no skewed rows") (FIL|SEL)* -> RS -
+
+      // Create a clone of the operator
+      Operator<? extends OperatorDesc> currOpClone;
+      try {
+        currOpClone = currOp.clone();
+        insertRowResolvers(currOp, currOpClone, ctx);
+      } catch (CloneNotSupportedException e) {
+        LOG.debug("Operator tree could not be cloned");
+        return null;
+      }
+
+      JoinOperator joinOpClone;
+      if (processSelect) {
+        joinOpClone = (JoinOperator) (currOpClone.getParentOperators().get(0));
+      } else {
+        joinOpClone = (JoinOperator) currOpClone;
+      }
+
+      // Put the filter "skewed columns = skewed values" on the original tree
+      // and "skewed columns != skewed values" on the clone.
+      insertSkewFilter(tableScanOpsForJoin, skewedValues, true);
+
+      // Only FILTERs and SELECTs are expected on the cloned tree as well.
+      List<TableScanOperator> tableScanCloneOpsForJoin =
+        new ArrayList<TableScanOperator>();
+      // Do not put this call inside the assert: an assert's condition is not
+      // evaluated when assertions are disabled, and the call has the side
+      // effect of populating the list.
+      boolean cloneTableScanOpsFound =
+        getTableScanOpsForJoin(joinOpClone, tableScanCloneOpsForJoin);
+      assert cloneTableScanOpsFound;
+
+      insertSkewFilter(tableScanCloneOpsForJoin, skewedValues, false);
+
+      // Update the topOps appropriately
+      Map<String, Operator<? extends OperatorDesc>> topOps = getTopOps(joinOpClone);
+      Map<String, Operator<? extends OperatorDesc>> origTopOps = parseContext.getTopOps();
+
+      for (Entry<String, Operator<? extends OperatorDesc>> topOp : topOps.entrySet()) {
+        TableScanOperator tso = (TableScanOperator) topOp.getValue();
+        Table origTable = parseContext.getTopToTable().get(ctx.getCloneTSOpMap().get(tso));
+        String tabAlias = tso.getConf().getAlias();
+        parseContext.getTopToTable().put(tso, origTable);
+        int initCnt = 1;
+        String newAlias = "subquery" + initCnt + ":" + tabAlias;
+        while (origTopOps.containsKey(newAlias)) {
+          initCnt++;
+          newAlias = "subquery" + initCnt + ":" + tabAlias;
+        }
+
+        parseContext.getTopOps().put(newAlias, tso);
+      }
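+
+      // The cloned scans are registered under fresh aliases such as
+      // "subquery1:a" and "subquery1:b"; these are the alias names visible in
+      // the Stage-1 "Alias -> Map Operator Tree" sections of the EXPLAIN
+      // outputs above.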
+
+      // Now do a union of the two operator trees (currOp and currOpClone).
+      // Store the operators that follow the select after the join; we will be
+      // adding them as children of the union later.
+      List<Operator<? extends OperatorDesc>> finalOps = currOp.getChildOperators();
+      currOp.setChildOperators(null);
+      currOpClone.setChildOperators(null);
+
+      // Make the union operator
+      List<Operator<? extends OperatorDesc>> oplist =
+        new ArrayList<Operator<? extends OperatorDesc>>();
+      oplist.add(currOp);
+      oplist.add(currOpClone);
+      Operator<? extends OperatorDesc> unionOp =
+        OperatorFactory.getAndMakeChild(
+          new UnionDesc(), new RowSchema(currOp.getSchema().getSignature()), oplist);
+
+      RowResolver unionRR = parseContext.getOpParseCtx().get(currOp).getRowResolver();
+      GenMapRedUtils.putOpInsertMap(unionOp, unionRR, parseContext);
+
+      // Introduce a select after the union
+      List<Operator<? extends OperatorDesc>> unionList =
+        new ArrayList<Operator<? extends OperatorDesc>>();
+      unionList.add(unionOp);
+
+      Operator<? extends OperatorDesc> selectUnionOp =
+        OperatorFactory.getAndMakeChild(
+          new SelectDesc(true),
+          new RowSchema(unionOp.getSchema().getSignature()), unionList);
+      GenMapRedUtils.putOpInsertMap(selectUnionOp, unionRR, parseContext);
+
+      // Add the finalOps after the union
+      selectUnionOp.setChildOperators(finalOps);
+      // Replace the original operator in the parent lists with selectUnionOp
+      for (Operator<? extends OperatorDesc> finalOp : finalOps) {
+        finalOp.replaceParent(currOp, selectUnionOp);
+      }
+      return null;
+    }
+
+    /*
+     * Get the list of table scan operators for this join. An interface,
+     * supportSkewJoinOptimization, has been provided; currently it is enabled
+     * only for simple filters and selects.
+     */
+    private boolean getTableScanOpsForJoin(
+      JoinOperator op,
+      List<TableScanOperator> tsOps) {
+
+      for (Operator<? extends OperatorDesc> parent : op.getParentOperators()) {
+        if (!getTableScanOps(parent, tsOps)) {
+          return false;
+        }
+      }
+      return true;
+    }
+
+    private boolean getTableScanOps(
+      Operator<? extends OperatorDesc> op,
+      List<TableScanOperator> tsOps) {
+      for (Operator<? extends OperatorDesc> parent : op.getParentOperators()) {
+        if (!parent.supportSkewJoinOptimization()) {
+          return false;
+        }
+
+        if (parent instanceof TableScanOperator) {
+          tsOps.add((TableScanOperator) parent);
+        } else if (!getTableScanOps(parent, tsOps)) {
+          return false;
+        }
+      }
+      return true;
+    }
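+
+    // Worked example for the method below (taken from skewjoinopt6.q above):
+    // T1 is skewed on (key, val) with values ((2,12), (8,18)) and T2 on
+    // (key, val) with ((3,13), (8,18)), but the join is on key alone. Only the
+    // key component of each skew entry is usable, so the collected values are
+    // (2), (8) and (3), and the generated scan filters test
+    // ((key = '2') or (key = '8')) or (key = '3'), as in the golden output.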
+
+    /**
+     * Returns the skewed values in all the tables which are going to be scanned.
+     * If the join is on columns c1, c2 and c3 of tables T1 and T2, where
+     * T1 is skewed on (c1, c4) with the skew values ((1,2),(3,4)) and
+     * T2 is skewed on (c1, c2) with skew values ((5,6),(7,8)), the resulting
+     * map would be: <(c1) -> ((1), (3)), (c1,c2) -> ((5,6),(7,8))>
+     * @param op the join operator
+     * @param tableScanOpsForJoin the table scans feeding this join
+     * @return map from the skewed join keys to their skewed values
+     */
+    private Map<List<ExprNodeDesc>, List<List<String>>>
+      getSkewedValues(
+        Operator<? extends OperatorDesc> op, List<TableScanOperator> tableScanOpsForJoin) {
+
+      Map<List<ExprNodeDesc>, List<List<String>>> skewDataReturn =
+        new HashMap<List<ExprNodeDesc>, List<List<String>>>();
+
+      Map<List<ExprNodeDescEqualityWrapper>, List<List<String>>> skewData =
+        new HashMap<List<ExprNodeDescEqualityWrapper>, List<List<String>>>();
+
+      // The join keys are available in the reduce sink operators before the join
+      for (Operator<? extends OperatorDesc> reduceSinkOp : op.getParentOperators()) {
+        ReduceSinkDesc rsDesc = ((ReduceSinkOperator) reduceSinkOp).getConf();
+
+        if (rsDesc.getKeyCols() != null) {
+          Table table = null;
+          // Find the skew information corresponding to the table
+          List<String> skewedColumns = null;
+          List<List<String>> skewedValueList = null;
+
+          // The join columns which are also skewed
+          List<ExprNodeDescEqualityWrapper> joinKeysSkewedCols =
+            new ArrayList<ExprNodeDescEqualityWrapper>();
+
+          // Positions of the skewed keys which intersect with the join keys
+          List<Integer> positionSkewedKeys = new ArrayList<Integer>();
+
+          // Update the joinKeys appropriately.
+          for (ExprNodeDesc keyColDesc : rsDesc.getKeyCols()) {
+            ExprNodeColumnDesc keyCol = null;
+
+            // If the key is not a plain column, this optimization cannot be applied.
+            if (keyColDesc instanceof ExprNodeColumnDesc) {
+              keyCol = (ExprNodeColumnDesc) keyColDesc;
+              if (table == null) {
+                table = getTable(parseContext, reduceSinkOp, tableScanOpsForJoin);
+                skewedColumns =
+                  table == null ? null : table.getSkewedColNames();
+                // No skew on the table to take care of
+                if ((skewedColumns == null) || (skewedColumns.isEmpty())) {
+                  continue;
+                }
+
+                skewedValueList =
+                  table == null ? null : table.getSkewedColValues();
+              }
+              int pos = skewedColumns.indexOf(keyCol.getColumn());
+              if ((pos >= 0) && (!positionSkewedKeys.contains(pos))) {
+                positionSkewedKeys.add(pos);
+                ExprNodeColumnDesc keyColClone = (ExprNodeColumnDesc) keyCol.clone();
+                keyColClone.setTabAlias(null);
+                joinKeysSkewedCols.add(new ExprNodeDescEqualityWrapper(keyColClone));
+              }
+            }
+          }
+
+          // If the skewed keys intersect the join keys, add them to the map
+          if ((skewedColumns != null) && (!skewedColumns.isEmpty())) {
+            if (!joinKeysSkewedCols.isEmpty()) {
+              // If the join keys match all the skewed keys, use the table's
+              // skewed values as-is
+              List<List<String>> skewedJoinValues;
+              if (skewedColumns.size() == positionSkewedKeys.size()) {
+                skewedJoinValues = skewedValueList;
+              }
+              else {
+                skewedJoinValues =
+                  getSkewedJoinValues(skewedValueList, positionSkewedKeys);
+              }
+
+              List<List<String>> oldSkewedJoinValues =
+                skewData.get(joinKeysSkewedCols);
+              if (oldSkewedJoinValues == null) {
+                oldSkewedJoinValues = new ArrayList<List<String>>();
+              }
+              for (List<String> skewValue : skewedJoinValues) {
+                if (!oldSkewedJoinValues.contains(skewValue)) {
+                  oldSkewedJoinValues.add(skewValue);
+                }
+              }
+
+              skewData.put(joinKeysSkewedCols, oldSkewedJoinValues);
+            }
+          }
+        }
+      }
+
+      // Convert skewData so that the keys contain plain ExprNodeDescs
+      for (Map.Entry<List<ExprNodeDescEqualityWrapper>, List<List<String>>> mapEntry :
+        skewData.entrySet()) {
+        List<ExprNodeDesc> skewedKeyJoinCols = new ArrayList<ExprNodeDesc>();
+        for (ExprNodeDescEqualityWrapper key : mapEntry.getKey()) {
+          skewedKeyJoinCols.add(key.getExprNodeDesc());
+        }
+        skewDataReturn.put(skewedKeyJoinCols, mapEntry.getValue());
+      }
+
+      return skewDataReturn;
+    }
+
+    /**
+     * Get the table scanned on the given operator's branch, provided that its
+     * table scan is one of the candidate table scans for this join.
+     */
+    private Table getTable(
+      ParseContext parseContext,
+      Operator<? extends OperatorDesc> op,
+      List<TableScanOperator> tableScanOpsForJoin) {
+      while (true) {
+        if (op instanceof TableScanOperator) {
+          TableScanOperator tsOp = (TableScanOperator) op;
+          if (tableScanOpsForJoin.contains(tsOp)) {
+            return parseContext.getTopToTable().get(tsOp);
+          }
+        }
+        if ((op.getParentOperators() == null) || (op.getParentOperators().size() > 1)) {
+          return null;
+        }
+        op = op.getParentOperators().get(0);
+      }
+    }
+
+    /*
+     * Get the skewed key values that are part of the join key.
+     * If skewedValueList contains ((1,2,3),(4,5,6)) and the caller asks for
+     * positions (0,2), the result is ((1,3),(4,6)).
+     * @param skewedValueList list of all the skewed values
+     * @param positionSkewedKeys the requested positions
+     * @return sub-list of the skewed values restricted to those positions
+     */
+    private List<List<String>> getSkewedJoinValues(
+      List<List<String>> skewedValueList, List<Integer> positionSkewedKeys) {
+      List<List<String>> skewedJoinValues = new ArrayList<List<String>>();
+      for (List<String> skewedValuesAllColumns : skewedValueList) {
+        List<String> skewedValuesSpecifiedColumns = new ArrayList<String>();
+        for (int pos : positionSkewedKeys) {
+          skewedValuesSpecifiedColumns.add(skewedValuesAllColumns.get(pos));
+        }
+        skewedJoinValues.add(skewedValuesSpecifiedColumns);
+      }
+      return skewedJoinValues;
+    }
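+
+    // Sketch of what the filter-insertion methods below produce for
+    // skewjoinopt1.q (T1 skewed on key value 2, T2 on key value 3): the
+    // original branch's scans get the predicate ((key = '2') or (key = '3'))
+    // and the cloned branch's scans get (not ((key = '2') or (key = '3'))),
+    // exactly the Filter Operators visible in Stage-1 and Stage-4 of the
+    // EXPLAIN output above.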
+
+    /**
+     * Inserts a filter comparing the join keys with the skewed keys. If the
+     * table is skewed with values (k1, v1) and (k2, v2) on columns (key, value),
+     * the filter ((key=k1 AND value=v1) OR (key=k2 AND value=v2)) is inserted.
+     * If skewed is false, the whole expression is wrapped in a NOT.
+     * @param tableScanOpsForJoin the table scans to put the filter on
+     * @param skewedValuesList map from skewed join keys to skewed values
+     * @param skewed true to keep only the skewed rows (skewedCol = skewedValue),
+     *               false to keep only the non-skewed rows
+     */
+    private void insertSkewFilter(
+      List<TableScanOperator> tableScanOpsForJoin,
+      Map<List<ExprNodeDesc>, List<List<String>>> skewedValuesList,
+      boolean skewed) {
+
+      ExprNodeDesc filterExpr = constructFilterExpr(skewedValuesList, skewed);
+      for (TableScanOperator tableScanOp : tableScanOpsForJoin) {
+        insertFilterOnTop(tableScanOp, filterExpr);
+      }
+    }
+
+    /**
+     * Inserts a filter directly after the table scan operator, built from the
+     * filter expression provided.
+     * @param tableScanOp the table scan operator
+     * @param filterExpr the filter expression
+     */
+    private void insertFilterOnTop(
+      TableScanOperator tableScanOp,
+      ExprNodeDesc filterExpr) {
+
+      // Get the top operator and its child; every operator here has a single parent
+      Operator<? extends OperatorDesc> currChild = tableScanOp.getChildOperators().get(0);
+
+      // Create the filter operator and update the parents and children appropriately
+      tableScanOp.setChildOperators(null);
+      currChild.setParentOperators(null);
+
+      Operator<FilterDesc> filter = OperatorFactory.getAndMakeChild(
+        new FilterDesc(filterExpr, false), tableScanOp);
+      filter.setSchema(new RowSchema(tableScanOp.getSchema().getSignature()));
+      OperatorFactory.makeChild(filter, currChild);
+
+      RowResolver filterRR = parseContext.getOpParseCtx().get(tableScanOp).getRowResolver();
+      GenMapRedUtils.putOpInsertMap(filter, filterRR, parseContext);
+    }
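+
+    // For the cloned (non-skewed) branch the same expression is negated, e.g.
+    // skewjoinopt1.q shows (not ((key = '2') or (key = '3'))) on the clone's
+    // scans; together the two branches partition the input rows, so the union
+    // of the two joins returns each qualifying row exactly once.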
+
+    /**
+     * Construct the filter expression from the skewed keys and skewed values.
+     * If the skewed join keys are (k1) and (k1,k3), with the skewed values
+     * (1,2) and ((2,3),(4,5)) respectively, the filter expression is:
+     * (k1=1) or (k1=2) or ((k1=2) and (k3=3)) or ((k1=4) and (k3=5)).
+     */
+    private ExprNodeDesc constructFilterExpr(
+        Map<List<ExprNodeDesc>, List<List<String>>> skewedValuesMap,
+        boolean skewed) {
+
+      ExprNodeDesc finalExprNodeDesc = null;
+      try {
+        for (Map.Entry<List<ExprNodeDesc>, List<List<String>>> mapEntry :
+            skewedValuesMap.entrySet()) {
+          List<ExprNodeDesc> keyCols = mapEntry.getKey();
+          List<List<String>> skewedValuesList = mapEntry.getValue();
+
+          for (List<String> skewedValues : skewedValuesList) {
+            int keyPos = 0;
+            ExprNodeDesc currExprNodeDesc = null;
+
+            // Build the conjunction: all the values match for all the columns
+            for (String skewedValue : skewedValues) {
+              List<ExprNodeDesc> children = new ArrayList<ExprNodeDesc>();
+
+              // We have ensured that the keys are columns
+              ExprNodeColumnDesc keyCol = (ExprNodeColumnDesc) keyCols.get(keyPos).clone();
+              keyPos++;
+              children.add(keyCol);
+
+              // Convert the constants available as strings to the corresponding objects
+              children.add(createConstDesc(skewedValue, keyCol));
+
+              // Create the equality condition
+              ExprNodeGenericFuncDesc expr =
+                  ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPEqual(), children);
+              if (currExprNodeDesc == null) {
+                currExprNodeDesc = expr;
+              } else {
+                // AND the current equality with the conditions built so far
+                List<ExprNodeDesc> childrenAND = new ArrayList<ExprNodeDesc>();
+                childrenAND.add(currExprNodeDesc);
+                childrenAND.add(expr);
+                currExprNodeDesc =
+                    ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPAnd(), childrenAND);
+              }
+            }
+
+            // If there is more than one skewed value,
+            // OR the current conjunction with the previous ones
+            if (finalExprNodeDesc == null) {
+              finalExprNodeDesc = currExprNodeDesc;
+            } else {
+              List<ExprNodeDesc> childrenOR = new ArrayList<ExprNodeDesc>();
+              childrenOR.add(finalExprNodeDesc);
+              childrenOR.add(currExprNodeDesc);
+
+              finalExprNodeDesc =
+                  ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPOr(), childrenOR);
+            }
+          }
+        }
+
+        // Add a NOT operator on top (this is for the cloned operator tree,
+        // because there we want the values which are not skewed)
+        if (!skewed) {
+          List<ExprNodeDesc> childrenNOT = new ArrayList<ExprNodeDesc>();
+          childrenNOT.add(finalExprNodeDesc);
+          finalExprNodeDesc =
+              ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPNot(), childrenNOT);
+        }
+      } catch (UDFArgumentException e) {
+        // Should not happen: both sides of every comparison have the same type,
+        // so the UDF arguments are always valid.
+      }
+      return finalExprNodeDesc;
+    }
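To make the shape of the generated predicate concrete, here is a minimal sketch that builds (key = '3') or (key = '8') the same way (assumes hive-exec on the classpath at this revision; the column name and values are made up):

    import java.util.Arrays;
    import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
    import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
    import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
    import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
    import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

    public class FilterExprSketch {
      public static void main(String[] args) throws UDFArgumentException {
        ExprNodeDesc key = new ExprNodeColumnDesc(
            TypeInfoFactory.stringTypeInfo, "key", null, false);
        ExprNodeDesc eq3 = ExprNodeGenericFuncDesc.newInstance(
            new GenericUDFOPEqual(),
            Arrays.asList(key, (ExprNodeDesc) new ExprNodeConstantDesc(
                TypeInfoFactory.stringTypeInfo, "3")));
        ExprNodeDesc eq8 = ExprNodeGenericFuncDesc.newInstance(
            new GenericUDFOPEqual(),
            Arrays.asList(key.clone(), (ExprNodeDesc) new ExprNodeConstantDesc(
                TypeInfoFactory.stringTypeInfo, "8")));
        ExprNodeDesc or = ExprNodeGenericFuncDesc.newInstance(
            new GenericUDFOPOr(), Arrays.asList(eq3, eq8));
        // Prints something like ((key = '3') or (key = '8'))
        System.out.println(or.getExprString());
      }
    }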
+
+    /**
+     * Converts the skewedValue available as a string in the metadata to an
+     * object of the appropriate type, using the type of the column from the
+     * join key.
+     * @param skewedValue the skewed value as stored in the metastore
+     * @param keyCol the join key column whose type drives the conversion
+     * @return an expression node descriptor of the appropriate constant
+     */
+    private ExprNodeConstantDesc createConstDesc(
+        String skewedValue, ExprNodeColumnDesc keyCol) {
+      ObjectInspector inputOI = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
+          TypeInfoFactory.stringTypeInfo);
+      ObjectInspector outputOI = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
+          keyCol.getTypeInfo());
+      Converter converter = ObjectInspectorConverters.getConverter(inputOI, outputOI);
+      Object skewedValueObject = converter.convert(skewedValue);
+      return new ExprNodeConstantDesc(keyCol.getTypeInfo(), skewedValueObject);
+    }
+
+    private Map<String, Operator<? extends OperatorDesc>> getTopOps(
+        Operator<? extends OperatorDesc> op) {
+      Map<String, Operator<? extends OperatorDesc>> topOps =
+          new HashMap<String, Operator<? extends OperatorDesc>>();
+      if (op.getParentOperators() == null || op.getParentOperators().size() == 0) {
+        topOps.put(((TableScanOperator) op).getConf().getAlias(), op);
+      } else {
+        for (Operator<? extends OperatorDesc> parent : op.getParentOperators()) {
+          if (parent != null) {
+            topOps.putAll(getTopOps(parent));
+          }
+        }
+      }
+      return topOps;
+    }
+
+    private void insertRowResolvers(
+        Operator<? extends OperatorDesc> op,
+        Operator<? extends OperatorDesc> opClone,
+        SkewJoinOptProcCtx ctx) {
+
+      if (op instanceof TableScanOperator) {
+        ctx.getCloneTSOpMap().put((TableScanOperator) opClone, (TableScanOperator) op);
+      }
+
+      GenMapRedUtils.putOpInsertMap(
+          opClone, parseContext.getOpParseCtx().get(op).getRowResolver(), parseContext);
+
+      List<Operator<? extends OperatorDesc>> parents = op.getParentOperators();
+      List<Operator<? extends OperatorDesc>> parentClones = opClone.getParentOperators();
+      if ((parents != null) && (!parents.isEmpty()) &&
+          (parentClones != null) && (!parentClones.isEmpty())) {
+        for (int pos = 0; pos < parents.size(); pos++) {
+          insertRowResolvers(parents.get(pos), parentClones.get(pos), ctx);
+        }
+      }
+    }
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.hive.ql.optimizer.Transform#transform
+   * (org.apache.hadoop.hive.ql.parse.ParseContext)
+   */
+  @Override
+  public ParseContext transform(ParseContext pctx) throws SemanticException {
+    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
+
+    // Skew join currently only allows filters and selects between the table
+    // scans and the join.
+    opRules.put(new RuleRegExp("R1", "TS%.*RS%JOIN%"), getSkewJoinProc());
+    LOG.debug("Inside transform of skewjoinoptimization");
+    SkewJoinOptProcCtx skewJoinOptProcCtx = new SkewJoinOptProcCtx(pctx);
+    // The dispatcher fires the processor corresponding to the closest matching
+    // rule and passes the context along
+    Dispatcher disp = new DefaultRuleDispatcher(
+        null, opRules, skewJoinOptProcCtx);
+    GraphWalker ogw = new DefaultGraphWalker(disp);
+
+    // Create a list of top op nodes
+    List<Node> topNodes = new ArrayList<Node>();
+    topNodes.addAll(pctx.getTopOps().values());
+    ogw.startWalking(topNodes, null);
+    return pctx;
+  }
+
+  private NodeProcessor getSkewJoinProc() {
+    return new SkewJoinProc();
+  }
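Stepping back to createConstDesc above: the string-to-typed-constant conversion can be exercised on its own. A minimal sketch (assumes hive-serde on the classpath; the int target type is an arbitrary choice for illustration):

    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

    public class ConstConversionSketch {
      public static void main(String[] args) {
        // Skewed values come out of the metastore as strings; convert "8"
        // to the join column's type (int here) before building the constant.
        ObjectInspector inputOI = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
            TypeInfoFactory.stringTypeInfo);
        ObjectInspector outputOI = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
            TypeInfoFactory.intTypeInfo);
        Converter converter = ObjectInspectorConverters.getConverter(inputOI, outputOI);
        Object typed = converter.convert("8");
        System.out.println(typed.getClass().getName() + ": " + typed); // java.lang.Integer: 8
      }
    }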
+
+  /**
+   * SkewJoinOptProcCtx.
+   *
+   */
+  public static class SkewJoinOptProcCtx implements NodeProcessorCtx {
+
+    private ParseContext pGraphContext;
+
+    // set of joins already processed
+    private Set<JoinOperator> doneJoins;
+    private Map<TableScanOperator, TableScanOperator> cloneTSOpMap;
+
+    public SkewJoinOptProcCtx(ParseContext pctx) {
+      this.pGraphContext = pctx;
+      doneJoins = new HashSet<JoinOperator>();
+      cloneTSOpMap = new HashMap<TableScanOperator, TableScanOperator>();
+    }
+
+    public ParseContext getpGraphContext() {
+      return pGraphContext;
+    }
+
+    public void setPGraphContext(ParseContext graphContext) {
+      pGraphContext = graphContext;
+    }
+
+    public Set<JoinOperator> getDoneJoins() {
+      return doneJoins;
+    }
+
+    public void setDoneJoins(Set<JoinOperator> doneJoins) {
+      this.doneJoins = doneJoins;
+    }
+
+    public Map<TableScanOperator, TableScanOperator> getCloneTSOpMap() {
+      return cloneTSOpMap;
+    }
+
+    public void setCloneTSOpMap(Map<TableScanOperator, TableScanOperator> cloneTSOpMap) {
+      this.cloneTSOpMap = cloneTSOpMap;
+    }
+  }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java	(working copy)
@@ -19,6 +19,7 @@
 package org.apache.hadoop.hive.ql.optimizer;
 
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
 
 import org.apache.hadoop.hive.conf.HiveConf;
@@ -55,6 +56,9 @@
       transformations.add(new PartitionPruner());
       transformations.add(new PartitionConditionRemover());
     }
+    if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSKEWJOIN)) {
+      transformations.add(new SkewJoinOptimizer());
+    }
     if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTGBYUSINGINDEX)) {
       transformations.add(new RewriteGBUsingIndex());
     }
@@ -88,7 +92,7 @@
    */
  public ParseContext optimize() throws SemanticException {
    for (Transform t : transformations) {
-      pctx = t.transform(pctx);
+      pctx = t.transform(pctx);
    }
    return pctx;
  }
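With the Optimizer hook above, the rewrite is gated entirely by the new HIVEOPTSKEWJOIN flag, which is off by default. A minimal sketch of enabling it programmatically (ordinarily one would just set hive.optimize.skewjoinoptimization in hive-site.xml or via the CLI):

    import org.apache.hadoop.hive.conf.HiveConf;

    public class EnableSkewJoinOpt {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // Flag added by this patch; disabled unless explicitly turned on.
        conf.setBoolVar(HiveConf.ConfVars.HIVEOPTSKEWJOIN, true);
        System.out.println(
            conf.getBoolVar(HiveConf.ConfVars.HIVEOPTSKEWJOIN)); // true
      }
    }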
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java	(working copy)
@@ -266,4 +266,11 @@
     }
   }
 
+  @Override
+  public boolean supportSkewJoinOptimization() {
+    // Since skew join optimization makes a copy of the tree above joins, and
+    // there is no multi-query optimization in place, let us not use skew join
+    // optimizations for now.
+    return false;
+  }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java	(working copy)
@@ -156,4 +156,9 @@
   public OperatorType getType() {
     return OperatorType.FILTER;
   }
+
+  @Override
+  public boolean supportSkewJoinOptimization() {
+    return true;
+  }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java	(working copy)
@@ -96,4 +96,9 @@
   public OperatorType getType() {
     return OperatorType.SELECT;
   }
+
+  @Override
+  public boolean supportSkewJoinOptimization() {
+    return true;
+  }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java	(working copy)
@@ -274,4 +274,9 @@
       }
     }
   }
+
+  @Override
+  public boolean supportSkewJoinOptimization() {
+    return true;
+  }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java	(working copy)
@@ -1334,6 +1334,10 @@
   public void cleanUpInputFileChangedOp() throws HiveException {
   }
 
+  public boolean supportSkewJoinOptimization() {
+    return false;
+  }
+
   @Override
   public Operator<? extends OperatorDesc> clone() throws CloneNotSupportedException {
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java	(working copy)
@@ -39,9 +39,9 @@
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
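Taken together, the supportSkewJoinOptimization overrides above implement a simple opt-in capability check: the optimizer can walk an operator path and bail out unless every operator supports the rewrite, with the base class defaulting to false. A standalone sketch of the pattern (simplified classes, not Hive's Operator hierarchy):

    import java.util.Arrays;
    import java.util.List;

    abstract class Op {
      // Default: operators do not support the rewrite unless they opt in.
      public boolean supportSkewJoinOptimization() { return false; }
    }

    class TableScan extends Op {
      @Override public boolean supportSkewJoinOptimization() { return true; }
    }

    class Filter extends Op {
      @Override public boolean supportSkewJoinOptimization() { return true; }
    }

    class GroupBy extends Op { /* keeps the false default */ }

    public class CapabilityCheckSketch {
      static boolean pathSupported(List<Op> path) {
        for (Op op : path) {
          if (!op.supportSkewJoinOptimization()) {
            return false;
          }
        }
        return true;
      }

      public static void main(String[] args) {
        System.out.println(pathSupported(Arrays.asList(new TableScan(), new Filter()))); // true
        System.out.println(pathSupported(Arrays.asList(new TableScan(), new GroupBy()))); // false
      }
    }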
Index: ql/src/java/org/apache/hadoop/hive/ql/util/ObjectPair.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/util/ObjectPair.java	(revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/util/ObjectPair.java	(working copy)
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.util;
+
+public class ObjectPair<F, S> {
+  private F first;
+  private S second;
+
+  public ObjectPair() {}
+
+  public ObjectPair(F first, S second) {
+    this.first = first;
+    this.second = second;
+  }
+
+  public F getFirst() {
+    return first;
+  }
+
+  public void setFirst(F first) {
+    this.first = first;
+  }
+
+  public S getSecond() {
+    return second;
+  }
+
+  public void setSecond(S second) {
+    this.second = second;
+  }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java	(working copy)
@@ -268,6 +268,14 @@
     return true;
   }
 
+  @Override
+  public int hashCode() {
+    int ret = super.hashCode();
+
+    ret += childExprs == null ? 0 : childExprs.hashCode();
+    return ret;
+  }
+
   public boolean isSortedExpr() {
     return isSortedExpr;
   }
@@ -275,5 +283,4 @@
   public void setSortedExpr(boolean isSortedExpr) {
     this.isSortedExpr = isSortedExpr;
   }
-
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDesc.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDesc.java	(working copy)
@@ -53,6 +53,11 @@
   // object equality - isSame means that the objects are semantically equal.
   public abstract boolean isSame(Object o);
 
+  @Override
+  public int hashCode() {
+    return typeInfo.hashCode();
+  }
+
   public TypeInfo getTypeInfo() {
     return typeInfo;
   }
@@ -116,5 +121,10 @@
       return this.exprNodeDesc.isSame(((ExprNodeDescEqualityWrapper)other).getExprNodeDesc());
     }
+
+    @Override
+    public int hashCode() {
+      return exprNodeDesc == null ? 0 : exprNodeDesc.hashCode();
+    }
   }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java	(working copy)
@@ -108,4 +108,12 @@
     return true;
   }
+
+  @Override
+  public int hashCode() {
+    int ret = super.hashCode();
+
+    ret += value == null ? 0 : value.hashCode();
+    return ret;
+  }
 }
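These hashCode overrides matter because the optimizer keys hash maps by ExprNodeDescEqualityWrapper, whose equals delegates to isSame; without a hashCode consistent with that, two semantically equal expressions could land in different buckets. A minimal sketch of the contract (assumes hive-exec on the classpath; the wrapper's constructor taking an ExprNodeDesc is assumed from context):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
    import org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper;
    import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

    public class HashCodeContractSketch {
      public static void main(String[] args) {
        ExprNodeDescEqualityWrapper a = new ExprNodeDescEqualityWrapper(
            new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "key", null, false));
        ExprNodeDescEqualityWrapper b = new ExprNodeDescEqualityWrapper(
            new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "key", null, false));

        Map<ExprNodeDescEqualityWrapper, String> m =
            new HashMap<ExprNodeDescEqualityWrapper, String>();
        m.put(a, "skew data");
        // The lookup succeeds only because hashCode is now consistent with
        // equals/isSame for semantically equal column expressions.
        System.out.println(m.get(b)); // skew data
      }
    }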
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeFieldDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeFieldDesc.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeFieldDesc.java	(working copy)
@@ -122,4 +122,14 @@
     return true;
   }
+
+  @Override
+  public int hashCode() {
+    int ret = super.hashCode();
+
+    ret += desc == null ? 0 : desc.hashCode();
+    ret += fieldName == null ? 0 : fieldName.hashCode();
+    ret += isList == null ? 0 : isList.hashCode();
+    return ret;
+  }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeColumnDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeColumnDesc.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeColumnDesc.java	(working copy)
@@ -156,4 +156,13 @@
   public void setSkewedCol(boolean isSkewedCol) {
     this.isSkewedCol = isSkewedCol;
   }
+
+  @Override
+  public int hashCode() {
+    int ret = super.hashCode();
+
+    ret += column == null ? 0 : column.hashCode();
+    ret += tabAlias == null ? 0 : tabAlias.hashCode();
+    return ret;
+  }
 }
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java	(revision 1379313)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java	(working copy)
@@ -8335,7 +8335,7 @@
 
     /**
      * This code is commented out pending further testing/development
-     * for (Task<? extends Serializable> t: rootTasks)
+     * for (Task<? extends Serializable> t: rootTasks)
      *   t.localizeMRTmpFiles(ctx);
      */
   }