Index: conf/hive-default.xml.template
===================================================================
--- conf/hive-default.xml.template (revision 1383248)
+++ conf/hive-default.xml.template (working copy)
@@ -426,6 +426,27 @@
+ hive.optimize.skewjoin.compiletime
+ false
+ Whether to create a separate plan for skewed keys for the tables in the join.
+ This is based on the skewed keys stored in the metadata. At compile time, the plan is broken
+ into different joins: one for the skewed keys, and the other for the remaining keys. And then,
+ a union is performed for the 2 joins generated above. So unless the same skewed key is present
+ in both the joined tables, the join for the skewed key will be performed as a map-side join.
+
+ The main difference between this paramater and hive.optimize.skewjoin is that this parameter
+ uses the skew information stored in the metastore to optimize the plan at compile time itself.
+ If there is no skew information in the metadata, this parameter will not have any affect.
+ Both hive.optimize.skewjoin.compiletime and hive.optimize.skewjoin should be set to true.
+ Ideally, hive.optimize.skewjoin should be renamed as hive.optimize.skewjoin.runtime, but not doing
+ so for backward compatibility.
+
+ If the skew information is correctly stored in the metadata, hive.optimize.skewjoin.compiletime
+ would change the query plan to take care of it, and hive.optimize.skewjoin will be a no-op.
+
+
+
+
hive.multigroupby.singlemr
false
Whether to optimize multi group by query to generate single M/R
@@ -459,7 +480,13 @@
hive.optimize.skewjoin
false
- Whether to enable skew join optimization.
+ Whether to enable skew join optimization.
+ The algorithm is as follows: At runtime, detect the keys with a large skew. Instead of
+ processing those keys, store them temporarily in a hdfs directory. In a follow-up map-reduce
+ job, process those skewed keys. The same key need not be skewed for all the tables, and so,
+ the follow-up map-reduce job (for the skewed keys) would be much faster, since it would be a
+ map-join.
+
Index: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
===================================================================
--- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (revision 1383248)
+++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (working copy)
@@ -495,6 +495,9 @@
HIVEOPTSORTMERGEBUCKETMAPJOIN("hive.optimize.bucketmapjoin.sortedmerge", false), // try to use sorted merge bucket map join
HIVEOPTREDUCEDEDUPLICATION("hive.optimize.reducededuplication", true),
+ // optimize skewed join by changing the query plan at compile time
+ HIVE_OPTIMIZE_SKEWJOIN_COMPILETIME("hive.optimize.skewjoin.compiletime", false),
+
// Indexes
HIVEOPTINDEXFILTER_COMPACT_MINSIZE("hive.optimize.index.filter.compact.minsize", (long) 5 * 1024 * 1024 * 1024), // 5G
HIVEOPTINDEXFILTER_COMPACT_MAXSIZE("hive.optimize.index.filter.compact.maxsize", (long) -1), // infinity
Index: ql/src/test/results/clientpositive/skewjoinopt8.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt8.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt8.q.out (working copy)
@@ -0,0 +1,299 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T3
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t3
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t3
+PREHOOK: query: -- This test is for validating skewed join compile time optimization for more than
+-- 2 tables. The join key is the same, and so a 3-way join would be performed.
+-- 1 of the 3 tables are skewed on the join key
+EXPLAIN
+SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- This test is for validating skewed join compile time optimization for more than
+-- 2 tables. The join key is the same, and so a 3-way join would be performed.
+-- 1 of the 3 tables are skewed on the join key
+EXPLAIN
+SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME T3) c) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL c) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME c))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-5
+ Stage-5 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((key = '3') or (key = '8')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((key = '3') or (key = '8')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:c
+ TableScan
+ alias: c
+ Filter Operator
+ predicate:
+ expr: (not ((key = '3') or (key = '8')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 2
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ 2 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ expr: _col8
+ type: string
+ expr: _col9
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key = '3') or (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((key = '3') or (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ c
+ TableScan
+ alias: c
+ Filter Operator
+ predicate:
+ expr: ((key = '3') or (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 2
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ 2 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ expr: _col8
+ type: string
+ expr: _col9
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+PREHOOK: Input: default@t3
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+POSTHOOK: Input: default@t3
+#### A masked pattern was here ####
+2 12 2 22 2 12
Index: ql/src/test/results/clientpositive/skewjoinopt3.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt3.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt3.q.out (working copy)
@@ -0,0 +1,456 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- a simple query with skew on both the tables. One of the skewed
+-- value is common to both the tables. The skewed value should not be
+-- repeated in the filter.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- a simple query with skew on both the tables. One of the skewed
+-- value is common to both the tables. The skewed value should not be
+-- repeated in the filter.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') or (key = '8')) or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') or (key = '8')) or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (((key = '2') or (key = '8')) or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (((key = '2') or (key = '8')) or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+2 12 2 22
+3 13 3 13
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
+PREHOOK: query: -- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_FULLOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') or (key = '8')) or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') or (key = '8')) or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Outer Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (((key = '2') or (key = '8')) or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (((key = '2') or (key = '8')) or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Outer Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+1 11 NULL NULL
+NULL NULL 4 14
+NULL NULL 5 15
+7 17 NULL NULL
+2 12 2 22
+3 13 3 13
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
Index: ql/src/test/results/clientpositive/skewjoinopt15.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt15.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt15.q.out (working copy)
@@ -0,0 +1,906 @@
+PREHOOK: query: CREATE TABLE tmpT1(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE tmpT1(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@tmpT1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE tmpT1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@tmpt1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE tmpT1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@tmpt1
+PREHOOK: query: -- testing skew on other data types - int
+CREATE TABLE T1(key INT, val STRING) SKEWED BY (key) ON ((2))
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- testing skew on other data types - int
+CREATE TABLE T1(key INT, val STRING) SKEWED BY (key) ON ((2))
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: INSERT OVERWRITE TABLE T1 SELECT key, val FROM tmpT1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tmpt1
+PREHOOK: Output: default@t1
+POSTHOOK: query: INSERT OVERWRITE TABLE T1 SELECT key, val FROM tmpT1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tmpt1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: CREATE TABLE tmpT2(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE tmpT2(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@tmpT2
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE tmpT2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@tmpt2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE tmpT2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@tmpt2
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: CREATE TABLE T2(key INT, val STRING) SKEWED BY (key) ON ((3))
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key INT, val STRING) SKEWED BY (key) ON ((3))
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: INSERT OVERWRITE TABLE T2 SELECT key, val FROM tmpT2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tmpt2
+PREHOOK: Output: default@t2
+POSTHOOK: query: INSERT OVERWRITE TABLE T2 SELECT key, val FROM tmpT2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tmpt2
+POSTHOOK: Output: default@t2
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key EXPRESSION [(tmpt2)tmpt2.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(tmpt2)tmpt2.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- The skewed key is a integer column.
+-- Otherwise this test is similar to skewjoinopt1.q
+-- Both the joined tables are skewed, and the joined column
+-- is an integer
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- The skewed key is a integer column.
+-- Otherwise this test is similar to skewjoinopt1.q
+-- Both the joined tables are skewed, and the joined column
+-- is an integer
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key EXPRESSION [(tmpt2)tmpt2.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(tmpt2)tmpt2.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((key = 2) or (key = 3)))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 0
+ value expressions:
+ expr: key
+ type: int
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((key = 2) or (key = 3)))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 1
+ value expressions:
+ expr: key
+ type: int
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ expr: _col4
+ type: int
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key = 2) or (key = 3))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 0
+ value expressions:
+ expr: key
+ type: int
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((key = 2) or (key = 3))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 1
+ value expressions:
+ expr: key
+ type: int
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ expr: _col4
+ type: int
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key EXPRESSION [(tmpt2)tmpt2.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(tmpt2)tmpt2.FieldSchema(name:val, type:string, comment:null), ]
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
+2 12 2 22
+3 13 3 13
+PREHOOK: query: -- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key EXPRESSION [(tmpt2)tmpt2.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(tmpt2)tmpt2.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((key = 2) or (key = 3)))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 0
+ value expressions:
+ expr: key
+ type: int
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((key = 2) or (key = 3)))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 1
+ value expressions:
+ expr: key
+ type: int
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ expr: _col4
+ type: int
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key = 2) or (key = 3))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 0
+ value expressions:
+ expr: key
+ type: int
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((key = 2) or (key = 3))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 1
+ value expressions:
+ expr: key
+ type: int
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ expr: _col4
+ type: int
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key EXPRESSION [(tmpt2)tmpt2.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(tmpt2)tmpt2.FieldSchema(name:val, type:string, comment:null), ]
+NULL NULL 4 14
+NULL NULL 5 15
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
+2 12 2 22
+3 13 3 13
+PREHOOK: query: -- an aggregation at the end should not change anything
+
+EXPLAIN
+SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- an aggregation at the end should not change anything
+
+EXPLAIN
+SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key EXPRESSION [(tmpt2)tmpt2.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(tmpt2)tmpt2.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((key = 2) or (key = 3)))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 0
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((key = 2) or (key = 3)))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ Select Operator
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key = 2) or (key = 3))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 0
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((key = 2) or (key = 3))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ Select Operator
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key EXPRESSION [(tmpt2)tmpt2.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(tmpt2)tmpt2.FieldSchema(name:val, type:string, comment:null), ]
+6
+PREHOOK: query: EXPLAIN
+SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key EXPRESSION [(tmpt2)tmpt2.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(tmpt2)tmpt2.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((key = 2) or (key = 3)))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 0
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((key = 2) or (key = 3)))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ Select Operator
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key = 2) or (key = 3))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 0
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((key = 2) or (key = 3))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: int
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: int
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ Select Operator
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.key EXPRESSION [(tmpt2)tmpt2.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t2.val SIMPLE [(tmpt2)tmpt2.FieldSchema(name:val, type:string, comment:null), ]
+8
Index: ql/src/test/results/clientpositive/skewjoinopt10.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt10.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt10.q.out (working copy)
@@ -0,0 +1,305 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, value STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, value STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: drop table array_valued_T1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table array_valued_T1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table array_valued_T1 (key string, value array) SKEWED BY (key) ON ((8))
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: create table array_valued_T1 (key string, value array) SKEWED BY (key) ON ((8))
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@array_valued_T1
+PREHOOK: query: insert overwrite table array_valued_T1 select key, array(value) from T1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@array_valued_t1
+POSTHOOK: query: insert overwrite table array_valued_T1 select key, array(value) from T1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@array_valued_t1
+POSTHOOK: Lineage: array_valued_t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: array_valued_t1.value EXPRESSION [(t1)t1.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: -- This test is to verify the skew join compile optimization when the join is followed by a lateral view
+explain
+select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- This test is to verify the skew join compile optimization when the join is followed by a lateral view
+explain
+select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: array_valued_t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: array_valued_t1.value EXPRESSION [(t1)t1.FieldSchema(name:value, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_LATERAL_VIEW (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION explode (TOK_TABLE_OR_COL array_val)) val (TOK_TABALIAS c))) (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME array_valued_T1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key) key) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) value) array_val)))) i))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-3
+ Stage-3 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: value
+ type: array
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1 {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col5
+ type: array
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Lateral View Forward
+ Select Operator
+ SELECT * : (no compute)
+ Lateral View Join Operator
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: array
+ expr: _col2
+ type: string
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ Select Operator
+ expressions:
+ expr: _col1
+ type: array
+ outputColumnNames: _col0
+ UDTF Operator
+ function name: explode
+ Lateral View Join Operator
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: array
+ expr: _col2
+ type: string
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Lateral View Forward
+ Select Operator
+ SELECT * : (no compute)
+ Lateral View Join Operator
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: array
+ expr: _col2
+ type: string
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ Select Operator
+ expressions:
+ expr: _col1
+ type: array
+ outputColumnNames: _col0
+ UDTF Operator
+ function name: explode
+ Lateral View Join Operator
+ outputColumnNames: _col0, _col1, _col2
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: array
+ expr: _col2
+ type: string
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+ i:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key = '8')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ i:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (key = '8')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: value
+ type: array
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1 {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col5
+ type: array
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@array_valued_t1
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@array_valued_t1
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: Lineage: array_valued_t1.key SIMPLE [(t1)t1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: array_valued_t1.value EXPRESSION [(t1)t1.FieldSchema(name:value, type:string, comment:null), ]
+1 ["11"] 11
+2 ["12"] 12
+3 ["13"] 13
+7 ["17"] 17
+8 ["18"] 18
+8 ["28"] 28
+8 ["18"] 18
+8 ["28"] 28
Index: ql/src/test/results/clientpositive/skewjoinopt5.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt5.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt5.q.out (working copy)
@@ -0,0 +1,239 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the first skewed column
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the first skewed column
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((key = '2') or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((key = '2') or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key = '2') or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((key = '2') or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
+2 12 2 22
+3 13 3 13
Index: ql/src/test/results/clientpositive/skewjoinopt17.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt17.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt17.q.out (working copy)
@@ -0,0 +1,512 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the first skewed column
+-- The skewed value for the jon key is common to both the tables.
+-- In this case, the skewed join value is not repeated in the filter.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the first skewed column
+-- The skewed value for the jon key is common to both the tables.
+-- In this case, the skewed join value is not repeated in the filter.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+3 13 3 13
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
+2 12 2 22
+PREHOOK: query: DROP TABLE T1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: DROP TABLE T1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+PREHOOK: query: DROP TABLE T2
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@t2
+PREHOOK: Output: default@t2
+POSTHOOK: query: DROP TABLE T2
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@t2
+POSTHOOK: Output: default@t2
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the both the columns
+-- In this case, the skewed join value is repeated in the filter.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the both the columns
+-- In this case, the skewed join value is repeated in the filter.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) val) (. (TOK_TABLE_OR_COL b) val))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') and (val = '12')) or (key = '2')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') and (val = '12')) or (key = '2')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (((key = '2') and (val = '12')) or (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (((key = '2') and (val = '12')) or (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+3 13 3 13
+8 18 8 18
+8 18 8 18
Index: ql/src/test/results/clientpositive/skewjoinopt12.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt12.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt12.q.out (working copy)
@@ -0,0 +1,251 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- Both the join tables are skewed by 2 keys, and one of the skewed values
+-- is common to both the tables. The join key matches the skewed key set.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Both the join tables are skewed by 2 keys, and one of the skewed values
+-- is common to both the tables. The join key matches the skewed key set.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) val) (. (TOK_TABLE_OR_COL b) val))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((((key = '2') and (val = '12')) or ((key = '8') and (val = '18'))) or ((key = '3') and (val = '13'))))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((((key = '2') and (val = '12')) or ((key = '8') and (val = '18'))) or ((key = '3') and (val = '13'))))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((((key = '2') and (val = '12')) or ((key = '8') and (val = '18'))) or ((key = '3') and (val = '13')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((((key = '2') and (val = '12')) or ((key = '8') and (val = '18'))) or ((key = '3') and (val = '13')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+3 13 3 13
+8 18 8 18
+8 18 8 18
Index: ql/src/test/results/clientpositive/skewjoinopt7.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt7.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt7.q.out (working copy)
@@ -0,0 +1,301 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T3
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t3
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t3
+PREHOOK: query: -- This test is for validating skewed join compile time optimization for more than
+-- 2 tables. The join key is the same, and so a 3-way join would be performed.
+-- 2 of the 3 tables are skewed on the join key
+EXPLAIN
+SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- This test is for validating skewed join compile time optimization for more than
+-- 2 tables. The join key is the same, and so a 3-way join would be performed.
+-- 2 of the 3 tables are skewed on the join key
+EXPLAIN
+SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME T3) c) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL c) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME c))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-5
+ Stage-5 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') or (key = '8')) or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') or (key = '8')) or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:c
+ TableScan
+ alias: c
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') or (key = '8')) or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 2
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ 2 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ expr: _col8
+ type: string
+ expr: _col9
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (((key = '2') or (key = '8')) or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (((key = '2') or (key = '8')) or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ c
+ TableScan
+ alias: c
+ Filter Operator
+ predicate:
+ expr: (((key = '2') or (key = '8')) or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 2
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ Inner Join 0 to 2
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ 2 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ expr: _col8
+ type: string
+ expr: _col9
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+PREHOOK: Input: default@t3
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+POSTHOOK: Input: default@t3
+#### A masked pattern was here ####
+2 12 2 22 2 12
Index: ql/src/test/results/clientpositive/skewjoinopt19.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt19.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt19.q.out (working copy)
@@ -0,0 +1,239 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) INTO 4 BUCKETS
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) INTO 4 BUCKETS
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- add a test where the skewed key is also the bucketized key
+-- it should not matter, and the compile time skewed join
+-- optimization is performed
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- add a test where the skewed key is also the bucketized key
+-- it should not matter, and the compile time skewed join
+-- optimization is performed
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+3 13 3 13
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
+2 12 2 22
Index: ql/src/test/results/clientpositive/skewjoinopt2.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt2.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt2.q.out (working copy)
@@ -0,0 +1,981 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2), (7)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2), (7)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- a simple query with skew on both the tables on the join key
+-- multiple skew values are present for the skewed keys
+-- but the skewed values do not overlap.
+-- The join values are a superset of the skewed keys.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- a simple query with skew on both the tables on the join key
+-- multiple skew values are present for the skewed keys
+-- but the skewed values do not overlap.
+-- The join values are a superset of the skewed keys.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) val) (. (TOK_TABLE_OR_COL b) val))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+3 13 3 13
+8 18 8 18
+8 18 8 18
+PREHOOK: query: -- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) val) (. (TOK_TABLE_OR_COL b) val))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Left Outer Join0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Left Outer Join0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+1 11 NULL NULL
+2 12 NULL NULL
+3 13 3 13
+7 17 NULL NULL
+8 18 8 18
+8 18 8 18
+8 28 NULL NULL
+PREHOOK: query: -- a group by at the end should not change anything
+
+EXPLAIN
+SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- a group by at the end should not change anything
+
+EXPLAIN
+SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) val) (. (TOK_TABLE_OR_COL b) val))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL a) key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1
+ handleSkewJoin: false
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1
+ handleSkewJoin: false
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+3 1
+8 2
+PREHOOK: query: EXPLAIN
+SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_LEFTOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) val) (. (TOK_TABLE_OR_COL b) val))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (TOK_FUNCTION count 1))) (TOK_GROUPBY (. (TOK_TABLE_OR_COL a) key))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Left Outer Join0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1
+ handleSkewJoin: false
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: _col0
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((((key = '2') or (key = '7')) or (key = '3')) or (key = '8'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Left Outer Join0 to 1
+ condition expressions:
+ 0 {VALUE._col0}
+ 1
+ handleSkewJoin: false
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+1 1
+2 1
+3 1
+7 1
+8 3
Index: ql/src/test/results/clientpositive/skewjoinopt14.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt14.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt14.q.out (working copy)
@@ -0,0 +1,314 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: CREATE TABLE T3(key STRING, val STRING)
+SKEWED BY (val) ON ((12)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T3(key STRING, val STRING)
+SKEWED BY (val) ON ((12)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T3
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t3
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t3
+PREHOOK: query: -- This test is for skewed join compile time optimization for more than 2 tables.
+-- The join key for table 3 is different from the join key used for joining
+-- tables 1 and 2. Tables 1 and 3 are skewed. Since one of the join sources for table
+-- 3 consist of a sub-query which contains a join, the compile time skew join
+-- optimization is not enabled for table 3, but it is used for the first join between
+-- tables 1 and 2
+EXPLAIN
+select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- This test is for skewed join compile time optimization for more than 2 tables.
+-- The join key for table 3 is different from the join key used for joining
+-- tables 1 and 2. Tables 1 and 3 are skewed. Since one of the join sources for table
+-- 3 consist of a sub-query which contains a join, the compile time skew join
+-- optimization is not enabled for table 3, but it is used for the first join between
+-- tables 1 and 2
+EXPLAIN
+select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME T3) c) (= (. (TOK_TABLE_OR_COL a) val) (. (TOK_TABLE_OR_COL c) val)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ c
+ TableScan
+ alias: c
+ Reduce Output Operator
+ key expressions:
+ expr: val
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col1
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col1
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1} {VALUE._col4} {VALUE._col5}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9
+ Select Operator
+ expressions:
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col8
+ type: string
+ expr: _col9
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+PREHOOK: Input: default@t3
+#### A masked pattern was here ####
+POSTHOOK: query: select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+POSTHOOK: Input: default@t3
+#### A masked pattern was here ####
+2 12 2 22 2 12
Index: ql/src/test/results/clientpositive/skewjoinopt9.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt9.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt9.q.out (working copy)
@@ -0,0 +1,362 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- no skew join compile time optimization would be performed if one of the
+-- join sources is a sub-query consisting of a union all
+EXPLAIN
+select * from
+(
+select key, val from T1
+ union all
+select key, val from T1
+) subq1
+join T2 b on subq1.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- no skew join compile time optimization would be performed if one of the
+-- join sources is a sub-query consisting of a union all
+EXPLAIN
+select * from
+(
+select key, val from T1
+ union all
+select key, val from T1
+) subq1
+join T2 b on subq1.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val))))) (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_TABLE_OR_COL val)))))) subq1) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ b
+ TableScan
+ alias: b
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ null-subquery1:subq1-subquery1:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: _col0, _col1
+ Union
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ null-subquery2:subq1-subquery2:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ outputColumnNames: _col0, _col1
+ Union
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ expr: _col3
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select * from
+(
+select key, val from T1
+ union all
+select key, val from T1
+) subq1
+join T2 b on subq1.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from
+(
+select key, val from T1
+ union all
+select key, val from T1
+) subq1
+join T2 b on subq1.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+2 12 2 22
+2 12 2 22
+3 13 3 13
+3 13 3 13
+8 18 8 18
+8 18 8 18
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
+8 28 8 18
+8 28 8 18
+PREHOOK: query: -- no skew join compile time optimization would be performed if one of the
+-- join sources is a sub-query consisting of a group by
+EXPLAIN
+select * from
+(
+select key, count(1) as cnt from T1 group by key
+) subq1
+join T2 b on subq1.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- no skew join compile time optimization would be performed if one of the
+-- join sources is a sub-query consisting of a group by
+EXPLAIN
+select * from
+(
+select key, count(1) as cnt from T1 group by key
+) subq1
+join T2 b on subq1.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_SUBQUERY (TOK_QUERY (TOK_FROM (TOK_TABREF (TOK_TABNAME T1))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_TABLE_OR_COL key)) (TOK_SELEXPR (TOK_FUNCTION count 1) cnt)) (TOK_GROUPBY (TOK_TABLE_OR_COL key)))) subq1) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL subq1) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subq1:t1
+ TableScan
+ alias: t1
+ Select Operator
+ expressions:
+ expr: key
+ type: string
+ outputColumnNames: key
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ keys:
+ expr: key
+ type: string
+ mode: hash
+ outputColumnNames: _col0, _col1
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: -1
+ value expressions:
+ expr: _col1
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ keys:
+ expr: KEY._col0
+ type: string
+ mode: mergepartial
+ outputColumnNames: _col0, _col1
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ outputColumnNames: _col0, _col1
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col0
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col0
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ b
+ TableScan
+ alias: b
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: bigint
+ expr: _col2
+ type: string
+ expr: _col3
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select * from
+(
+select key, count(1) as cnt from T1 group by key
+) subq1
+join T2 b on subq1.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from
+(
+select key, count(1) as cnt from T1 group by key
+) subq1
+join T2 b on subq1.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+2 1 2 22
+3 1 3 13
+8 2 8 18
+8 2 8 18
Index: ql/src/test/results/clientpositive/skewjoinopt4.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt4.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt4.q.out (working copy)
@@ -0,0 +1,440 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+3 13 3 13
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
+2 12 2 22
+PREHOOK: query: -- the order of the join should not matter, just confirming
+EXPLAIN
+SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- the order of the join should not matter, just confirming
+EXPLAIN
+SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T2) a) (TOK_TABREF (TOK_TABNAME T1) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+3 13 3 13
+8 18 8 18
+8 18 8 28
+8 18 8 18
+8 18 8 28
+2 22 2 12
Index: ql/src/test/results/clientpositive/skewjoinopt16.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt16.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt16.q.out (working copy)
@@ -0,0 +1,251 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the both the columns
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the both the columns
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (and (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)) (= (. (TOK_TABLE_OR_COL a) val) (. (TOK_TABLE_OR_COL b) val))))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') and (val = '12')) or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') and (val = '12')) or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (((key = '2') and (val = '12')) or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (((key = '2') and (val = '12')) or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ sort order: ++
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+8 18 8 18
+8 18 8 18
+3 13 3 13
Index: ql/src/test/results/clientpositive/skewjoinopt11.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt11.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt11.q.out (working copy)
@@ -0,0 +1,459 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- This test is to verify the skew join compile optimization when the join is followed
+-- by a union. Both sides of a union consist of a skew join compile time optimization.
+EXPLAIN
+select * from
+(
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+ union all
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+) subq1
+PREHOOK: type: QUERY
+POSTHOOK: query: -- This test is to verify the skew join compile optimization when the join is followed
+-- by a union. Both sides of a union consist of a skew join compile time optimization.
+EXPLAIN
+select * from
+(
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+ union all
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+) subq1
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_SUBQUERY (TOK_UNION (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) val) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) val) val2)))) (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) key)) (TOK_SELEXPR (. (TOK_TABLE_OR_COL a) val) val1) (TOK_SELEXPR (. (TOK_TABLE_OR_COL b) val) val2))))) subq1)) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-5
+ Stage-3 depends on stages: Stage-2, Stage-8
+ Stage-5 is a root stage
+ Stage-7 is a root stage
+ Stage-8 depends on stages: Stage-7, Stage-9
+ Stage-9 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-3
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col2
+ type: string
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-5
+ Map Reduce
+ Alias -> Map Operator Tree:
+ null-subquery2:subq1-subquery2:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ null-subquery2:subq1-subquery2:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-7
+ Map Reduce
+ Alias -> Map Operator Tree:
+ null-subquery1:subq1-subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ null-subquery1:subq1-subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-8
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-9
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery2:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery2:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select * from
+(
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+ union all
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+) subq1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from
+(
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+ union all
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+) subq1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+3 13 13
+8 18 18
+8 18 18
+8 28 18
+8 28 18
+2 12 22
+2 12 22
+3 13 13
+8 18 18
+8 18 18
+8 28 18
+8 28 18
Index: ql/src/test/results/clientpositive/skewjoinopt20.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt20.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt20.q.out (working copy)
@@ -0,0 +1,239 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- add a test where the skewed key is also the bucketized/sorted key
+-- it should not matter, and the compile time skewed join
+-- optimization is performed
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- add a test where the skewed key is also the bucketized/sorted key
+-- it should not matter, and the compile time skewed join
+-- optimization is performed
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (key = '2'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (key = '2')
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+3 13 3 13
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
+2 12 2 22
Index: ql/src/test/results/clientpositive/skewjoinopt6.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt6.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt6.q.out (working copy)
@@ -0,0 +1,241 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- Both the join tables are skewed by 2 keys, and one of the skewed values
+-- is common to both the tables. The join key is a subset of the skewed key set:
+-- it only contains the first skewed key for both the tables
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Both the join tables are skewed by 2 keys, and one of the skewed values
+-- is common to both the tables. The join key is a subset of the skewed key set:
+-- it only contains the first skewed key for both the tables
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') or (key = '8')) or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not (((key = '2') or (key = '8')) or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (((key = '2') or (key = '8')) or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (((key = '2') or (key = '8')) or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+2 12 2 22
+3 13 3 13
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
Index: ql/src/test/results/clientpositive/skewjoinopt18.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt18.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt18.q.out (working copy)
@@ -0,0 +1,160 @@
+PREHOOK: query: CREATE TABLE tmpT1(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE tmpT1(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@tmpT1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE tmpT1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@tmpt1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE tmpT1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@tmpt1
+PREHOOK: query: -- testing skew on other data types - int
+CREATE TABLE T1(key INT, val STRING) SKEWED BY (key) ON ((2))
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- testing skew on other data types - int
+CREATE TABLE T1(key INT, val STRING) SKEWED BY (key) ON ((2))
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: INSERT OVERWRITE TABLE T1 SELECT key, val FROM tmpT1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tmpt1
+PREHOOK: Output: default@t1
+POSTHOOK: query: INSERT OVERWRITE TABLE T1 SELECT key, val FROM tmpT1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tmpt1
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- Tke skewed column is same in both the tables, however it is
+-- INT in one of the tables, and STRING in the other table
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: -- Tke skewed column is same in both the tables, however it is
+-- INT in one of the tables, and STRING in the other table
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+PREHOOK: query: -- Once HIVE-3445 is fixed, the compile time skew join optimization would be
+-- applicable here. Till the above jira is fixed, it would be performed as a
+-- regular join
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- Once HIVE-3445 is fixed, the compile time skew join optimization would be
+-- applicable here. Till the above jira is fixed, it would be performed as a
+-- regular join
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Reduce Output Operator
+ key expressions:
+ expr: UDFToDouble(key)
+ type: double
+ sort order: +
+ Map-reduce partition columns:
+ expr: UDFToDouble(key)
+ type: double
+ tag: 0
+ value expressions:
+ expr: key
+ type: int
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Reduce Output Operator
+ key expressions:
+ expr: UDFToDouble(key)
+ type: double
+ sort order: +
+ Map-reduce partition columns:
+ expr: UDFToDouble(key)
+ type: double
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: int
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: Lineage: t1.key EXPRESSION [(tmpt1)tmpt1.FieldSchema(name:key, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.val SIMPLE [(tmpt1)tmpt1.FieldSchema(name:val, type:string, comment:null), ]
+2 12 2 22
+3 13 3 13
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
Index: ql/src/test/results/clientpositive/skewjoinopt1.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt1.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt1.q.out (working copy)
@@ -0,0 +1,834 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: -- a simple query with skew on both the tables on different keys
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- a simple query with skew on both the tables on different keys
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((key = '2') or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((key = '2') or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key = '2') or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((key = '2') or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
+2 12 2 22
+3 13 3 13
+PREHOOK: query: -- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME a))) (TOK_SELEXPR (TOK_ALLCOLREF (TOK_TABNAME b))))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((key = '2') or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((key = '2') or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key = '2') or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((key = '2') or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ Select Operator
+ expressions:
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+NULL NULL 4 14
+NULL NULL 5 15
+8 18 8 18
+8 18 8 18
+8 28 8 18
+8 28 8 18
+2 12 2 22
+3 13 3 13
+PREHOOK: query: -- an aggregation at the end should not change anything
+
+EXPLAIN
+SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: -- an aggregation at the end should not change anything
+
+EXPLAIN
+SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((key = '2') or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((key = '2') or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ Select Operator
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key = '2') or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((key = '2') or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ Select Operator
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+6
+PREHOOK: query: EXPLAIN
+SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN
+SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_RIGHTOUTERJOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR (TOK_FUNCTION count 1)))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1, Stage-4
+ Stage-4 is a root stage
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ subquery1:a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: (not ((key = '2') or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ subquery1:b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: (not ((key = '2') or (key = '3')))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ Select Operator
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+#### A masked pattern was here ####
+ TableScan
+ Union
+ Select Operator
+ SELECT * : (no compute)
+ Group By Operator
+ aggregations:
+ expr: count(1)
+ bucketGroup: false
+ mode: hash
+ outputColumnNames: _col0
+ Reduce Output Operator
+ sort order:
+ tag: -1
+ value expressions:
+ expr: _col0
+ type: bigint
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations:
+ expr: count(VALUE._col0)
+ bucketGroup: false
+ mode: mergepartial
+ outputColumnNames: _col0
+ Select Operator
+ expressions:
+ expr: _col0
+ type: bigint
+ outputColumnNames: _col0
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-4
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Filter Operator
+ predicate:
+ expr: ((key = '2') or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ b
+ TableScan
+ alias: b
+ Filter Operator
+ predicate:
+ expr: ((key = '2') or (key = '3'))
+ type: boolean
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Right Outer Join0 to 1
+ condition expressions:
+ 0
+ 1
+ handleSkewJoin: false
+ Select Operator
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+8
Index: ql/src/test/results/clientpositive/skewjoinopt13.q.out
===================================================================
--- ql/src/test/results/clientpositive/skewjoinopt13.q.out (revision 0)
+++ ql/src/test/results/clientpositive/skewjoinopt13.q.out (working copy)
@@ -0,0 +1,215 @@
+PREHOOK: query: CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t2
+PREHOOK: query: CREATE TABLE T3(key STRING, val STRING)
+SKEWED BY (val) ON ((12)) STORED AS TEXTFILE
+PREHOOK: type: CREATETABLE
+POSTHOOK: query: CREATE TABLE T3(key STRING, val STRING)
+SKEWED BY (val) ON ((12)) STORED AS TEXTFILE
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: default@T3
+PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3
+PREHOOK: type: LOAD
+PREHOOK: Output: default@t3
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3
+POSTHOOK: type: LOAD
+POSTHOOK: Output: default@t3
+PREHOOK: query: -- This test is for skewed join compile time optimization for more than 2 tables.
+-- The join key for table 3 is different from the join key used for joining
+-- tables 1 and 2. Table 3 is skewed, but since one of the join sources for table
+-- 3 consist of a sub-query which contains a join, the compile time skew join
+-- optimization is not performed
+
+EXPLAIN
+select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val
+PREHOOK: type: QUERY
+POSTHOOK: query: -- This test is for skewed join compile time optimization for more than 2 tables.
+-- The join key for table 3 is different from the join key used for joining
+-- tables 1 and 2. Table 3 is skewed, but since one of the join sources for table
+-- 3 consist of a sub-query which contains a join, the compile time skew join
+-- optimization is not performed
+
+EXPLAIN
+select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val
+POSTHOOK: type: QUERY
+ABSTRACT SYNTAX TREE:
+ (TOK_QUERY (TOK_FROM (TOK_JOIN (TOK_JOIN (TOK_TABREF (TOK_TABNAME T1) a) (TOK_TABREF (TOK_TABNAME T2) b) (= (. (TOK_TABLE_OR_COL a) key) (. (TOK_TABLE_OR_COL b) key))) (TOK_TABREF (TOK_TABNAME T3) c) (= (. (TOK_TABLE_OR_COL a) val) (. (TOK_TABLE_OR_COL c) val)))) (TOK_INSERT (TOK_DESTINATION (TOK_DIR TOK_TMP_FILE)) (TOK_SELECT (TOK_SELEXPR TOK_ALLCOLREF))))
+
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Alias -> Map Operator Tree:
+ a
+ TableScan
+ alias: a
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 0
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ b
+ TableScan
+ alias: b
+ Reduce Output Operator
+ key expressions:
+ expr: key
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: key
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+
+ Stage: Stage-2
+ Map Reduce
+ Alias -> Map Operator Tree:
+ $INTNAME
+ Reduce Output Operator
+ key expressions:
+ expr: _col1
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: _col1
+ type: string
+ tag: 0
+ value expressions:
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ c
+ TableScan
+ alias: c
+ Reduce Output Operator
+ key expressions:
+ expr: val
+ type: string
+ sort order: +
+ Map-reduce partition columns:
+ expr: val
+ type: string
+ tag: 1
+ value expressions:
+ expr: key
+ type: string
+ expr: val
+ type: string
+ Reduce Operator Tree:
+ Join Operator
+ condition map:
+ Inner Join 0 to 1
+ condition expressions:
+ 0 {VALUE._col0} {VALUE._col1} {VALUE._col4} {VALUE._col5}
+ 1 {VALUE._col0} {VALUE._col1}
+ handleSkewJoin: false
+ outputColumnNames: _col0, _col1, _col4, _col5, _col8, _col9
+ Select Operator
+ expressions:
+ expr: _col4
+ type: string
+ expr: _col5
+ type: string
+ expr: _col0
+ type: string
+ expr: _col1
+ type: string
+ expr: _col8
+ type: string
+ expr: _col9
+ type: string
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ File Output Operator
+ compressed: false
+ GlobalTableId: 0
+ table:
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+
+
+PREHOOK: query: select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+PREHOOK: Input: default@t3
+#### A masked pattern was here ####
+POSTHOOK: query: select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+POSTHOOK: Input: default@t3
+#### A masked pattern was here ####
+2 12 2 22 2 12
Index: ql/src/test/queries/clientpositive/skewjoinopt1.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt1.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt1.q (working copy)
@@ -0,0 +1,38 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- a simple join query with skew on both the tables on the join key
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+-- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key;
+
+-- an aggregation at the end should not change anything
+
+EXPLAIN
+SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key;
+
+EXPLAIN
+SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key;
+
+SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt19.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt19.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt19.q (working copy)
@@ -0,0 +1,20 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) INTO 4 BUCKETS
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- add a test where the skewed key is also the bucketized key
+-- it should not matter, and the compile time skewed join
+-- optimization is performed
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt3.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt3.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt3.q (working copy)
@@ -0,0 +1,28 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- a simple query with skew on both the tables. One of the skewed
+-- value is common to both the tables. The skewed value should not be
+-- repeated in the filter.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+-- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a FULL OUTER JOIN T2 b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt5.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt5.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt5.q (working copy)
@@ -0,0 +1,20 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the first skewed column
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt7.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt7.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt7.q (working copy)
@@ -0,0 +1,24 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2), (8)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3;
+
+-- This test is for validating skewed join compile time optimization for more than
+-- 2 tables. The join key is the same, and so a 3-way join would be performed.
+-- 2 of the 3 tables are skewed on the join key
+EXPLAIN
+SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key;
+
+SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt10.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt10.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt10.q (working copy)
@@ -0,0 +1,17 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, value STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+drop table array_valued_T1;
+create table array_valued_T1 (key string, value array) SKEWED BY (key) ON ((8));
+insert overwrite table array_valued_T1 select key, array(value) from T1;
+
+-- This test is to verify the skew join compile optimization when the join is followed by a lateral view
+explain
+select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val;
+
+select * from (select a.key as key, b.value as array_val from T1 a join array_valued_T1 b on a.key=b.key) i lateral view explode (array_val) c as val;
+
Index: ql/src/test/queries/clientpositive/skewjoinopt9.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt9.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt9.q (working copy)
@@ -0,0 +1,45 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- no skew join compile time optimization would be performed if one of the
+-- join sources is a sub-query consisting of a union all
+EXPLAIN
+select * from
+(
+select key, val from T1
+ union all
+select key, val from T1
+) subq1
+join T2 b on subq1.key = b.key;
+
+select * from
+(
+select key, val from T1
+ union all
+select key, val from T1
+) subq1
+join T2 b on subq1.key = b.key;
+
+-- no skew join compile time optimization would be performed if one of the
+-- join sources is a sub-query consisting of a group by
+EXPLAIN
+select * from
+(
+select key, count(1) as cnt from T1 group by key
+) subq1
+join T2 b on subq1.key = b.key;
+
+select * from
+(
+select key, count(1) as cnt from T1 group by key
+) subq1
+join T2 b on subq1.key = b.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt12.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt12.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt12.q (working copy)
@@ -0,0 +1,20 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- Both the join tables are skewed by 2 keys, and one of the skewed values
+-- is common to both the tables. The join key matches the skewed key set.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val;
Index: ql/src/test/queries/clientpositive/skewjoinopt14.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt14.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt14.q (working copy)
@@ -0,0 +1,34 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+CREATE TABLE T3(key STRING, val STRING)
+SKEWED BY (val) ON ((12)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3;
+
+-- This test is for skewed join compile time optimization for more than 2 tables.
+-- The join key for table 3 is different from the join key used for joining
+-- tables 1 and 2. Tables 1 and 3 are skewed. Since one of the join sources for table
+-- 3 consist of a sub-query which contains a join, the compile time skew join
+-- optimization is not enabled for table 3, but it is used for the first join between
+-- tables 1 and 2
+EXPLAIN
+select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val;
+
+select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val;
+
Index: ql/src/test/queries/clientpositive/skewjoinopt16.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt16.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt16.q (working copy)
@@ -0,0 +1,20 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the both the columns
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val;
Index: ql/src/test/queries/clientpositive/skewjoinopt18.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt18.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt18.q (working copy)
@@ -0,0 +1,26 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE tmpT1(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE tmpT1;
+
+-- testing skew on other data types - int
+CREATE TABLE T1(key INT, val STRING) SKEWED BY (key) ON ((2));
+INSERT OVERWRITE TABLE T1 SELECT key, val FROM tmpT1;
+
+-- Tke skewed column is same in both the tables, however it is
+-- INT in one of the tables, and STRING in the other table
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- Once HIVE-3445 is fixed, the compile time skew join optimization would be
+-- applicable here. Till the above jira is fixed, it would be performed as a
+-- regular join
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt2.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt2.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt2.q (working copy)
@@ -0,0 +1,41 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2), (7)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- a simple query with skew on both the tables on the join key
+-- multiple skew values are present for the skewed keys
+-- but the skewed values do not overlap.
+-- The join values are a superset of the skewed keys.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val;
+
+-- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val;
+
+SELECT a.*, b.* FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val;
+
+-- a group by at the end should not change anything
+
+EXPLAIN
+SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key;
+
+SELECT a.key, count(1) FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key;
+
+EXPLAIN
+SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key;
+
+SELECT a.key, count(1) FROM T1 a LEFT OUTER JOIN T2 b ON a.key = b.key and a.val = b.val group by a.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt4.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt4.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt4.q (working copy)
@@ -0,0 +1,25 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- only of the tables of the join (the left table of the join) is skewed
+-- the skewed filter would still be applied to both the tables
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+-- the order of the join should not matter, just confirming
+EXPLAIN
+SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T2 a JOIN T1 b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt6.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt6.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt6.q (working copy)
@@ -0,0 +1,21 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12), (8, 18)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key, val) ON ((3, 13), (8, 18)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- Both the join tables are skewed by 2 keys, and one of the skewed values
+-- is common to both the tables. The join key is a subset of the skewed key set:
+-- it only contains the first skewed key for both the tables
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt8.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt8.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt8.q (working copy)
@@ -0,0 +1,23 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((3), (8)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+CREATE TABLE T3(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3;
+
+-- This test is for validating skewed join compile time optimization for more than
+-- 2 tables. The join key is the same, and so a 3-way join would be performed.
+-- 1 of the 3 tables are skewed on the join key
+EXPLAIN
+SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key;
+
+SELECT a.*, b.*, c.* FROM T1 a JOIN T2 b ON a.key = b.key JOIN T3 c on a.key = c.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt11.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt11.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt11.q (working copy)
@@ -0,0 +1,29 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- This test is to verify the skew join compile optimization when the join is followed
+-- by a union. Both sides of a union consist of a join, which should have used
+-- skew join compile time optimization.
+EXPLAIN
+select * from
+(
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+ union all
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+) subq1;
+
+select * from
+(
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+ union all
+ select a.key, a.val as val1, b.val as val2 from T1 a join T2 b on a.key = b.key
+) subq1;
Index: ql/src/test/queries/clientpositive/skewjoinopt20.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt20.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt20.q (working copy)
@@ -0,0 +1,20 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+CLUSTERED BY (key) SORTED BY (key) INTO 4 BUCKETS
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- add a test where the skewed key is also the bucketized/sorted key
+-- it should not matter, and the compile time skewed join
+-- optimization is performed
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt13.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt13.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt13.q (working copy)
@@ -0,0 +1,33 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+CREATE TABLE T3(key STRING, val STRING)
+SKEWED BY (val) ON ((12)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T3.txt' INTO TABLE T3;
+
+-- This test is for skewed join compile time optimization for more than 2 tables.
+-- The join key for table 3 is different from the join key used for joining
+-- tables 1 and 2. Table 3 is skewed, but since one of the join sources for table
+-- 3 consist of a sub-query which contains a join, the compile time skew join
+-- optimization is not performed
+
+EXPLAIN
+select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val;
+
+select *
+from
+T1 a join T2 b on a.key = b.key
+join T3 c on a.val = c.val;
+
Index: ql/src/test/queries/clientpositive/skewjoinopt15.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt15.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt15.q (working copy)
@@ -0,0 +1,46 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE tmpT1(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE tmpT1;
+
+-- testing skew on other data types - int
+CREATE TABLE T1(key INT, val STRING) SKEWED BY (key) ON ((2));
+INSERT OVERWRITE TABLE T1 SELECT key, val FROM tmpT1;
+
+CREATE TABLE tmpT2(key STRING, val STRING) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE tmpT2;
+
+CREATE TABLE T2(key INT, val STRING) SKEWED BY (key) ON ((3));
+
+INSERT OVERWRITE TABLE T2 SELECT key, val FROM tmpT2;
+
+-- The skewed key is a integer column.
+-- Otherwise this test is similar to skewjoinopt1.q
+-- Both the joined tables are skewed, and the joined column
+-- is an integer
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+-- test outer joins also
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key;
+
+-- an aggregation at the end should not change anything
+
+EXPLAIN
+SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT count(1) FROM T1 a JOIN T2 b ON a.key = b.key;
+
+EXPLAIN
+SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key;
+
+SELECT count(1) FROM T1 a RIGHT OUTER JOIN T2 b ON a.key = b.key;
Index: ql/src/test/queries/clientpositive/skewjoinopt17.q
===================================================================
--- ql/src/test/queries/clientpositive/skewjoinopt17.q (revision 0)
+++ ql/src/test/queries/clientpositive/skewjoinopt17.q (working copy)
@@ -0,0 +1,45 @@
+set hive.internal.ddl.list.bucketing.enable=true;
+set hive.optimize.skewjoin.compiletime = true;
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the first skewed column
+-- The skewed value for the jon key is common to both the tables.
+-- In this case, the skewed join value is not repeated in the filter.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key;
+
+DROP TABLE T1;
+DROP TABLE T2;
+
+
+CREATE TABLE T1(key STRING, val STRING)
+SKEWED BY (key, val) ON ((2, 12)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T1.txt' INTO TABLE T1;
+
+CREATE TABLE T2(key STRING, val STRING)
+SKEWED BY (key) ON ((2)) STORED AS TEXTFILE;
+
+LOAD DATA LOCAL INPATH '../data/files/T2.txt' INTO TABLE T2;
+
+-- One of the tables is skewed by 2 columns, and the other table is
+-- skewed by one column. Ths join is performed on the both the columns
+-- In this case, the skewed join value is repeated in the filter.
+
+EXPLAIN
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val;
+
+SELECT a.*, b.* FROM T1 a JOIN T2 b ON a.key = b.key and a.val = b.val;
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/SkewJoinOptimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/SkewJoinOptimizer.java (revision 0)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/SkewJoinOptimizer.java (working copy)
@@ -0,0 +1,681 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.JoinOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.RowResolver;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc.ExprNodeDescEqualityWrapper;
+import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
+import org.apache.hadoop.hive.ql.plan.FilterDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.plan.UnionDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNot;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+
+/**
+ * SkewJoinOptimizer.
+ *
+ */
+public class SkewJoinOptimizer implements Transform {
+
+ private static final Log LOG = LogFactory.getLog(SkewJoinOptimizer.class.getName());
+ private static ParseContext parseContext;
+
+ public static class SkewJoinProc implements NodeProcessor {
+ public SkewJoinProc() {
+ super();
+ }
+
+ @Override
+ public Object process(Node nd, Stack stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ // We should be having a tree which looks like this
+ // TS -> * -> RS -
+ // \
+ // -> JOIN -> ..
+ // /
+ // TS -> * -> RS -
+ //
+ // We are in the join operator now.
+
+ SkewJoinOptProcCtx ctx = (SkewJoinOptProcCtx) procCtx;
+ parseContext = ctx.getpGraphContext();
+
+ JoinOperator joinOp = (JoinOperator)nd;
+ // This join has already been processed
+ if (ctx.getDoneJoins().contains(joinOp)) {
+ return null;
+ }
+
+ ctx.getDoneJoins().add(joinOp);
+
+ Operator extends OperatorDesc> currOp = joinOp;
+ boolean processSelect = false;
+
+ // Is there a select following
+ // Clone the select also. It is useful for a follow-on optimization where the union
+ // followed by a select star is completely removed.
+ if ((joinOp.getChildOperators().size() == 1) &&
+ (joinOp.getChildOperators().get(0) instanceof SelectOperator)) {
+ currOp = joinOp.getChildOperators().get(0);
+ processSelect = true;
+ }
+
+ List tableScanOpsForJoin = new ArrayList();
+ if (!getTableScanOpsForJoin(joinOp, tableScanOpsForJoin)) {
+ return null;
+ }
+
+ if ((tableScanOpsForJoin == null) || (tableScanOpsForJoin.isEmpty())) {
+ return null;
+ }
+
+ // Get the skewed values in all the tables
+ Map, List>> skewedValues =
+ getSkewedValues(joinOp, tableScanOpsForJoin);
+
+ // If there are no skewed values, nothing needs to be done
+ if (skewedValues == null || skewedValues.size() == 0) {
+ return null;
+ }
+
+ // After this optimization, the tree should be like:
+ // TS -> (FIL "skewed rows") * -> RS -
+ // \
+ // -> JOIN
+ // / \
+ // TS -> (FIL "skewed rows") * -> RS - \
+ // \
+ // -> UNION -> ..
+ // /
+ // TS -> (FIL "no skewed rows") * -> RS - /
+ // \ /
+ // -> JOIN
+ // /
+ // TS -> (FIL "no skewed rows") * -> RS -
+ //
+
+ // Create a clone of the operator
+ Operator extends OperatorDesc> currOpClone;
+ try {
+ currOpClone = currOp.clone();
+ insertRowResolvers(currOp, currOpClone, ctx);
+ } catch (CloneNotSupportedException e) {
+ LOG.debug("Operator tree could not be cloned");
+ return null;
+ }
+
+ JoinOperator joinOpClone;
+ if (processSelect) {
+ joinOpClone = (JoinOperator)(currOpClone.getParentOperators().get(0));
+ } else {
+ joinOpClone = (JoinOperator)currOpClone;
+ }
+
+ // Put the filter "skewed column = skewed keys" in op
+ // and "skewed columns != skewed keys" in selectOpClone
+ insertSkewFilter(tableScanOpsForJoin, skewedValues, true);
+
+ List tableScanCloneOpsForJoin =
+ new ArrayList();
+ assert
+ getTableScanOpsForJoin(joinOpClone, tableScanCloneOpsForJoin);
+
+ insertSkewFilter(tableScanCloneOpsForJoin, skewedValues, false);
+
+ // Update the topOps appropriately
+ Map> topOps = getTopOps(joinOpClone);
+ Map> origTopOps = parseContext.getTopOps();
+
+ for (Entry> topOp : topOps.entrySet()) {
+ TableScanOperator tso = (TableScanOperator) topOp.getValue();
+ Table origTable = parseContext.getTopToTable().get(ctx.getCloneTSOpMap().get(tso));
+ String tabAlias = tso.getConf().getAlias();
+ parseContext.getTopToTable().put(tso, origTable);
+ int initCnt = 1;
+ String newAlias = "subquery" + initCnt + ":" + tabAlias;
+ while (origTopOps.containsKey(newAlias)) {
+ initCnt++;
+ newAlias = "subquery" + initCnt + ":" + tabAlias;
+ }
+
+ parseContext.getTopOps().put(newAlias, tso);
+ }
+
+ // Now do a union of the select operators: selectOp and selectOpClone
+ // Store the operator that follows the select after the join, we will be
+ // adding this as a child to the Union later
+ List> finalOps = currOp.getChildOperators();
+ currOp.setChildOperators(null);
+ currOpClone.setChildOperators(null);
+
+ // Make the union operator
+ List> oplist =
+ new ArrayList>();
+ oplist.add(currOp);
+ oplist.add(currOpClone);
+ Operator extends OperatorDesc> unionOp =
+ OperatorFactory.getAndMakeChild(
+ new UnionDesc(), new RowSchema(currOp.getSchema().getSignature()), oplist);
+
+ RowResolver unionRR = parseContext.getOpParseCtx().get(currOp).getRowResolver();
+ GenMapRedUtils.putOpInsertMap(unionOp, unionRR, parseContext);
+
+ // Introduce a select after the union
+ List> unionList =
+ new ArrayList>();
+ unionList.add(unionOp);
+
+ Operator extends OperatorDesc> selectUnionOp =
+ OperatorFactory.getAndMakeChild(
+ new SelectDesc(true),
+ new RowSchema(unionOp.getSchema().getSignature()), unionList);
+ GenMapRedUtils.putOpInsertMap(selectUnionOp, unionRR, parseContext);
+
+ // add the finalOp after the union
+ selectUnionOp.setChildOperators(finalOps);
+ // replace the original selectOp in the parents with selectUnionOp
+ for (Operator extends OperatorDesc> finalOp : finalOps) {
+ finalOp.replaceParent(currOp, selectUnionOp);
+ }
+ return null;
+ }
+
+ /*
+ * Get the list of table scan operators for this join. A interface supportSkewJoinOptimization
+ * has been provided. Currently, it is only enabled for simple filters and selects.
+ */
+ private boolean getTableScanOpsForJoin(
+ JoinOperator op,
+ List tsOps) {
+
+ for (Operator extends OperatorDesc> parent : op.getParentOperators()) {
+ if (!getTableScanOps(parent, tsOps)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private boolean getTableScanOps(
+ Operator extends OperatorDesc> op,
+ List tsOps) {
+ for (Operator extends OperatorDesc> parent : op.getParentOperators()) {
+ if (!parent.supportSkewJoinOptimization()) {
+ return false;
+ }
+
+ if (parent instanceof TableScanOperator) {
+ tsOps.add((TableScanOperator)parent);
+ } else if (!getTableScanOps(parent, tsOps)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Returns the skewed values in all the tables which are going to be scanned.
+ * If the join is on columns c1, c2 and c3 on tables T1 and T2,
+ * T1 is skewed on c1 and c4 with the skew values ((1,2),(3,4)),
+ * whereas T2 is skewed on c1, c2 with skew values ((5,6),(7,8)), the resulting
+ * map would be: <(c1) -> ((1), (3)), (c1,c2) -> ((5,6),(7,8))>
+ * @param op The join operator being optimized
+ * @param tableScanOpsForJoin table scan operators which are parents of the join operator
+ * @return map.
+ */
+ private Map, List>>
+ getSkewedValues(
+ Operator extends OperatorDesc> op, List tableScanOpsForJoin) {
+
+ Map , List>> skewDataReturn =
+ new HashMap, List>>();
+
+ Map , List>> skewData =
+ new HashMap, List>>();
+
+ // The join keys are available in the reduceSinkOperators before join
+ for (Operator extends OperatorDesc> reduceSinkOp : op.getParentOperators()) {
+ ReduceSinkDesc rsDesc = ((ReduceSinkOperator) reduceSinkOp).getConf();
+
+ if (rsDesc.getKeyCols() != null) {
+ Table table = null;
+ // Find the skew information corresponding to the table
+ List skewedColumns = null;
+ List> skewedValueList = null;
+
+ // The join columns which are also skewed
+ List joinKeysSkewedCols =
+ new ArrayList();
+
+ // skewed Keys which intersect with join keys
+ List positionSkewedKeys = new ArrayList();
+
+ // Update the joinKeys appropriately.
+ for (ExprNodeDesc keyColDesc : rsDesc.getKeyCols()) {
+ ExprNodeColumnDesc keyCol = null;
+
+ // If the key column is not a column, then dont apply this optimization.
+ // This will be fixed as part of https://issues.apache.org/jira/browse/HIVE-3445
+ // for type conversion UDFs.
+ if (keyColDesc instanceof ExprNodeColumnDesc) {
+ keyCol = (ExprNodeColumnDesc) keyColDesc;
+ if (table == null) {
+ table = getTable(parseContext, reduceSinkOp, tableScanOpsForJoin);
+ skewedColumns =
+ table == null ? null : table.getSkewedColNames();
+ // No skew on the table to take care of
+ if ((skewedColumns == null) || (skewedColumns.isEmpty())) {
+ continue;
+ }
+
+ skewedValueList =
+ table == null ? null : table.getSkewedColValues();
+ }
+ int pos = skewedColumns.indexOf(keyCol.getColumn());
+ if ((pos >= 0) && (!positionSkewedKeys.contains(pos))) {
+ positionSkewedKeys.add(pos);
+ ExprNodeColumnDesc keyColClone = (ExprNodeColumnDesc) keyCol.clone();
+ keyColClone.setTabAlias(null);
+ joinKeysSkewedCols.add(new ExprNodeDescEqualityWrapper(keyColClone));
+ }
+ }
+ }
+
+ // If the skew keys match the join keys, then add it to the list
+ if ((skewedColumns != null) && (!skewedColumns.isEmpty())) {
+ if (!joinKeysSkewedCols.isEmpty()) {
+ // If the join keys matches the skewed keys, use the table skewed keys
+ List> skewedJoinValues;
+ if (skewedColumns.size() == positionSkewedKeys.size()) {
+ skewedJoinValues = skewedValueList;
+ }
+ else {
+ skewedJoinValues =
+ getSkewedJoinValues(skewedValueList, positionSkewedKeys);
+ }
+
+ List> oldSkewedJoinValues =
+ skewData.get(joinKeysSkewedCols);
+ if (oldSkewedJoinValues == null) {
+ oldSkewedJoinValues = new ArrayList>();
+ }
+ for (List skewValue : skewedJoinValues) {
+ if (!oldSkewedJoinValues.contains(skewValue)) {
+ oldSkewedJoinValues.add(skewValue);
+ }
+ }
+
+ skewData.put(joinKeysSkewedCols, oldSkewedJoinValues);
+ }
+ }
+ }
+ }
+
+ // convert skewData to contain ExprNodeDesc in the keys
+ for (Map.Entry, List>> mapEntry :
+ skewData.entrySet()) {
+ List skewedKeyJoinCols = new ArrayList();
+ for (ExprNodeDescEqualityWrapper key : mapEntry.getKey()) {
+ skewedKeyJoinCols.add(key.getExprNodeDesc());
+ }
+ skewDataReturn.put(skewedKeyJoinCols, mapEntry.getValue());
+ }
+
+ return skewDataReturn;
+ }
+
+ /**
+ * Get the table alias from the candidate table scans.
+ */
+ private Table getTable(
+ ParseContext parseContext,
+ Operator extends OperatorDesc> op,
+ List tableScanOpsForJoin) {
+ while (true) {
+ if (op instanceof TableScanOperator) {
+ TableScanOperator tsOp = (TableScanOperator)op;
+ if (tableScanOpsForJoin.contains(tsOp)) {
+ return parseContext.getTopToTable().get(tsOp);
+ }
+ }
+ if ((op.getParentOperators() == null) || (op.getParentOperators().size() > 1)) {
+ return null;
+ }
+ op = op.getParentOperators().get(0);
+ }
+ }
+
+ /*
+ * If the skewedValues contains ((1,2,3),(4,5,6)), and the user is looking for
+ * positions (0,2), the result would be ((1,3),(4,6))
+ * Get the skewed key values that are part of the join key.
+ * @param skewedValuesList List of all the skewed values
+ * @param positionSkewedKeys the requested positions
+ * @return sub-list of skewed values with the positions present
+ */
+ private List> getSkewedJoinValues(
+ List> skewedValueList, List positionSkewedKeys) {
+ List> skewedJoinValues = new ArrayList>();
+ for (List skewedValuesAllColumns : skewedValueList) {
+ List skewedValuesSpecifiedColumns = new ArrayList();
+ for (int pos : positionSkewedKeys) {
+ skewedValuesSpecifiedColumns.add(skewedValuesAllColumns.get(pos));
+ }
+ skewedJoinValues.add(skewedValuesSpecifiedColumns);
+ }
+ return skewedJoinValues;
+ }
+
+ /**
+ * Inserts a filter comparing the join keys with the skewed keys. If the table
+ * is skewed with values (k1, v1) and (k2, v2) on columns (key, value), then
+ * filter ((key=k1 AND value=v1) OR (key=k2 AND value=v2)) is inserted. If @skewed
+ * is false, a NOT is inserted before it.
+ * @param tableScanOpsForJoin table scans for which the filter will be inserted
+ * @param skewedValuesList the map of
+ * @param skewed True if we want skewedCol = skewedValue, false if we want
+ * not (skewedCol = skewedValue)
+ */
+ private void insertSkewFilter(
+ List tableScanOpsForJoin,
+ Map, List>> skewedValuesList,
+ boolean skewed) {
+
+ ExprNodeDesc filterExpr = constructFilterExpr(skewedValuesList, skewed);
+ for (TableScanOperator tableScanOp : tableScanOpsForJoin) {
+ insertFilterOnTop(tableScanOp, filterExpr);
+ }
+ }
+
+ /**
+ * Inserts a filter below the table scan operator. Construct the filter
+ * from the filter expression provided.
+ * @param tableScanOp the table scan operators
+ * @param filterExpr the filter expression
+ */
+ private void insertFilterOnTop(
+ TableScanOperator tableScanOp,
+ ExprNodeDesc filterExpr) {
+
+ // Get the top operator and it's child, all operators have a single parent
+ Operator extends OperatorDesc> currChild = tableScanOp.getChildOperators().get(0);
+
+ // Create the filter Operator and update the parents and children appropriately
+ tableScanOp.setChildOperators(null);
+ currChild.setParentOperators(null);
+
+ Operator filter = OperatorFactory.getAndMakeChild(
+ new FilterDesc(filterExpr, false), tableScanOp);
+ filter.setSchema(new RowSchema(tableScanOp.getSchema().getSignature()));
+ OperatorFactory.makeChild(filter, currChild);
+
+ RowResolver filterRR = parseContext.getOpParseCtx().get(tableScanOp).getRowResolver();
+ GenMapRedUtils.putOpInsertMap(filter, filterRR, parseContext);
+ }
+
+ /**
+ * Construct the filter expression from the skewed keys and skewed values.
+ * If the skewed join keys are (k1), and (k1,k3) with the skewed values
+ * (1,2) and ((2,3),(4,5)) respectively, the filter expression would be:
+ * (k1=1) or (k1=2) or ((k1=2) and (k3=3)) or ((k1=4) and (k3=5)).
+ */
+ private ExprNodeDesc constructFilterExpr(
+ Map, List>> skewedValuesMap,
+ boolean skewed) {
+
+ ExprNodeDesc finalExprNodeDesc = null;
+ try {
+ for (Map.Entry, List>> mapEntry :
+ skewedValuesMap.entrySet()) {
+ List keyCols = mapEntry.getKey();
+ List> skewedValuesList = mapEntry.getValue();
+
+ for (List skewedValues : skewedValuesList) {
+ int keyPos = 0;
+ ExprNodeDesc currExprNodeDesc = null;
+
+ // Make the following condition: all the values match for all the columns
+ for (String skewedValue : skewedValues) {
+ List children = new ArrayList();
+
+ // We have ensured that the keys are columns
+ ExprNodeColumnDesc keyCol = (ExprNodeColumnDesc) keyCols.get(keyPos).clone();
+ keyPos++;
+ children.add(keyCol);
+
+ // Convert the constants available as strings to the corresponding objects
+ children.add(createConstDesc(skewedValue, keyCol));
+
+ ExprNodeGenericFuncDesc expr = null;
+ // Create the equality condition
+ expr = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPEqual(), children);
+ if (currExprNodeDesc == null) {
+ currExprNodeDesc = expr;
+ } else {
+ // If there are previous nodes, then AND the current node with the previous one
+ List childrenAND = new ArrayList();
+ childrenAND.add(currExprNodeDesc);
+ childrenAND.add(expr);
+ currExprNodeDesc =
+ ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPAnd(), childrenAND);
+ }
+ }
+
+ // If there are more than one skewed values,
+ // then OR the current node with the previous one
+ if (finalExprNodeDesc == null) {
+ finalExprNodeDesc = currExprNodeDesc;
+ } else {
+ List childrenOR = new ArrayList();
+ childrenOR.add(finalExprNodeDesc);
+ childrenOR.add(currExprNodeDesc);
+
+ finalExprNodeDesc =
+ ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPOr(), childrenOR);
+ }
+ }
+ }
+
+ // Add a NOT operator in the beginning (this is for the cloned operator because we
+ // want the values which are not skewed
+ if (skewed == false) {
+ List childrenNOT = new ArrayList();
+ childrenNOT.add(finalExprNodeDesc);
+ finalExprNodeDesc =
+ ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPNot(), childrenNOT);
+ }
+ } catch (UDFArgumentException e) {
+ // Ignore the exception because we are not comparing Long vs. String here.
+ // There should never be an exception
+ assert false;
+ }
+ return finalExprNodeDesc;
+ }
+
+ /**
+ * Converts the skewedValue available as a string in the metadata to the appropriate object
+ * by using the type of the column from the join key.
+ * @param skewedValue
+ * @param keyCol
+ * @return an expression node descriptor of the appropriate constant
+ */
+ private ExprNodeConstantDesc createConstDesc(
+ String skewedValue, ExprNodeColumnDesc keyCol) {
+ ObjectInspector inputOI = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
+ TypeInfoFactory.stringTypeInfo);
+ ObjectInspector outputOI = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
+ keyCol.getTypeInfo());
+ Converter converter = ObjectInspectorConverters.getConverter(inputOI, outputOI);
+ Object skewedValueObject = converter.convert(skewedValue);
+ return new ExprNodeConstantDesc(keyCol.getTypeInfo(), skewedValueObject);
+ }
+
+ private Map> getTopOps(
+ Operator extends OperatorDesc> op) {
+ Map> topOps =
+ new HashMap>();
+ if (op.getParentOperators() == null || op.getParentOperators().size() == 0) {
+ topOps.put(((TableScanOperator)op).getConf().getAlias(), op);
+ } else {
+ for (Operator extends OperatorDesc> parent : op.getParentOperators()) {
+ if (parent != null) {
+ topOps.putAll(getTopOps(parent));
+ }
+ }
+ }
+ return topOps;
+ }
+
+ private void insertRowResolvers(
+ Operator extends OperatorDesc> op,
+ Operator extends OperatorDesc> opClone,
+ SkewJoinOptProcCtx ctx) {
+
+ if (op instanceof TableScanOperator) {
+ ctx.getCloneTSOpMap().put((TableScanOperator)opClone, (TableScanOperator)op);
+ }
+
+ GenMapRedUtils.putOpInsertMap(
+ opClone, parseContext.getOpParseCtx().get(op).getRowResolver(), parseContext);
+
+ List> parents = op.getParentOperators();
+ List> parentClones = opClone.getParentOperators();
+ if ((parents != null) && (!parents.isEmpty()) &&
+ (parentClones != null) && (!parentClones.isEmpty())) {
+ for (int pos = 0; pos < parents.size(); pos++) {
+ insertRowResolvers(parents.get(pos), parentClones.get(pos), ctx);
+ }
+ }
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.hive.ql.optimizer.Transform#transform
+ * (org.apache.hadoop.hive.ql.parse.ParseContext)
+ */
+ @Override
+ public ParseContext transform(ParseContext pctx) throws SemanticException {
+ Map opRules = new LinkedHashMap();
+
+ opRules.put(new RuleRegExp("R1", "TS%.*RS%JOIN%"), getSkewJoinProc());
+ SkewJoinOptProcCtx skewJoinOptProcCtx = new SkewJoinOptProcCtx(pctx);
+ // The dispatcher fires the processor corresponding to the closest matching
+ // rule and passes the context along
+ Dispatcher disp = new DefaultRuleDispatcher(
+ null, opRules, skewJoinOptProcCtx);
+ GraphWalker ogw = new DefaultGraphWalker(disp);
+
+ // Create a list of topop nodes
+ List topNodes = new ArrayList();
+ topNodes.addAll(pctx.getTopOps().values());
+ ogw.startWalking(topNodes, null);
+ return pctx;
+ }
+
+ private NodeProcessor getSkewJoinProc() {
+ return new SkewJoinProc();
+ }
+
+ /**
+ * SkewJoinOptProcCtx.
+ *
+ */
+ public static class SkewJoinOptProcCtx implements NodeProcessorCtx {
+
+ private ParseContext pGraphContext;
+
+ // set of joins already processed
+ private Set doneJoins;
+ private Map cloneTSOpMap;
+
+ public SkewJoinOptProcCtx(ParseContext pctx) {
+ this.pGraphContext = pctx;
+ doneJoins = new HashSet();
+ cloneTSOpMap = new HashMap();
+ }
+
+ public ParseContext getpGraphContext() {
+ return pGraphContext;
+ }
+
+ public void setPGraphContext(ParseContext graphContext) {
+ pGraphContext = graphContext;
+ }
+
+ public Set getDoneJoins() {
+ return doneJoins;
+ }
+
+ public void setDoneJoins(Set doneJoins) {
+ this.doneJoins = doneJoins;
+ }
+
+ public Map getCloneTSOpMap() {
+ return cloneTSOpMap;
+ }
+
+ public void setCloneTSOpMap(Map cloneTSOpMap) {
+ this.cloneTSOpMap = cloneTSOpMap;
+ }
+ }
+}
Index: ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (working copy)
@@ -55,6 +55,9 @@
transformations.add(new PartitionPruner());
transformations.add(new PartitionConditionRemover());
}
+ if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_OPTIMIZE_SKEWJOIN_COMPILETIME)) {
+ transformations.add(new SkewJoinOptimizer());
+ }
if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTGBYUSINGINDEX)) {
transformations.add(new RewriteGBUsingIndex());
}
@@ -88,7 +91,7 @@
*/
public ParseContext optimize() throws SemanticException {
for (Transform t : transformations) {
- pctx = t.transform(pctx);
+ pctx = t.transform(pctx);
}
return pctx;
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/JoinOperator.java (working copy)
@@ -266,4 +266,11 @@
}
}
+ @Override
+ public boolean supportSkewJoinOptimization() {
+ // Since skew join optimization makes a copy of the tree above joins, and
+ // there is no multi-query optimization in place, let us not use skew join
+ // optimizations for now.
+ return false;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/FilterOperator.java (working copy)
@@ -156,4 +156,9 @@
public OperatorType getType() {
return OperatorType.FILTER;
}
+
+ @Override
+ public boolean supportSkewJoinOptimization() {
+ return true;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/SelectOperator.java (working copy)
@@ -96,4 +96,9 @@
public OperatorType getType() {
return OperatorType.SELECT;
}
+
+ @Override
+ public boolean supportSkewJoinOptimization() {
+ return true;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java (working copy)
@@ -274,4 +274,9 @@
}
}
}
+
+ @Override
+ public boolean supportSkewJoinOptimization() {
+ return true;
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java (working copy)
@@ -1334,6 +1334,10 @@
public void cleanUpInputFileChangedOp() throws HiveException {
}
+ public boolean supportSkewJoinOptimization() {
+ return false;
+ }
+
@Override
public Operator extends OperatorDesc> clone()
throws CloneNotSupportedException {
Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java (working copy)
@@ -39,9 +39,9 @@
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeGenericFuncDesc.java (working copy)
@@ -22,6 +22,7 @@
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.lang.builder.HashCodeBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@@ -268,6 +269,15 @@
return true;
}
+ @Override
+ public int hashCode() {
+ int superHashCode = super.hashCode();
+ HashCodeBuilder builder = new HashCodeBuilder();
+ builder.appendSuper(superHashCode);
+ builder.append(childExprs);
+ return builder.toHashCode();
+ }
+
public boolean isSortedExpr() {
return isSortedExpr;
}
@@ -275,5 +285,4 @@
public void setSortedExpr(boolean isSortedExpr) {
this.isSortedExpr = isSortedExpr;
}
-
}
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDesc.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDesc.java (working copy)
@@ -53,6 +53,11 @@
// object equality - isSame means that the objects are semantically equal.
public abstract boolean isSame(Object o);
+ @Override
+ public int hashCode() {
+ return typeInfo.hashCode();
+ }
+
public TypeInfo getTypeInfo() {
return typeInfo;
}
@@ -116,5 +121,10 @@
return this.exprNodeDesc.isSame(((ExprNodeDescEqualityWrapper)other).getExprNodeDesc());
}
+
+ @Override
+ public int hashCode() {
+ return exprNodeDesc == null ? 0 : exprNodeDesc.hashCode();
+ }
}
}
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeConstantDesc.java (working copy)
@@ -20,6 +20,7 @@
import java.io.Serializable;
+import org.apache.commons.lang.builder.HashCodeBuilder;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
@@ -108,4 +109,13 @@
return true;
}
+
+ @Override
+ public int hashCode() {
+ int superHashCode = super.hashCode();
+ HashCodeBuilder builder = new HashCodeBuilder();
+ builder.appendSuper(superHashCode);
+ builder.append(value);
+ return builder.toHashCode();
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeFieldDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeFieldDesc.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeFieldDesc.java (working copy)
@@ -22,6 +22,7 @@
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.lang.builder.HashCodeBuilder;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
@@ -122,4 +123,15 @@
return true;
}
+
+ @Override
+ public int hashCode() {
+ int superHashCode = super.hashCode();
+ HashCodeBuilder builder = new HashCodeBuilder();
+ builder.appendSuper(superHashCode);
+ builder.append(desc);
+ builder.append(fieldName);
+ builder.append(isList);
+ return builder.toHashCode();
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeColumnDesc.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeColumnDesc.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeColumnDesc.java (working copy)
@@ -22,6 +22,7 @@
import java.util.ArrayList;
import java.util.List;
+import org.apache.commons.lang.builder.HashCodeBuilder;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
@@ -156,4 +157,14 @@
public void setSkewedCol(boolean isSkewedCol) {
this.isSkewedCol = isSkewedCol;
}
+
+ @Override
+ public int hashCode() {
+ int superHashCode = super.hashCode();
+ HashCodeBuilder builder = new HashCodeBuilder();
+ builder.appendSuper(superHashCode);
+ builder.append(column);
+ builder.append(tabAlias);
+ return builder.toHashCode();
+ }
}
Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
===================================================================
--- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 1383248)
+++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy)
@@ -8335,7 +8335,7 @@
/**
* This code is commented out pending further testing/development
- * for (Task extends SerializableCloneable> t: rootTasks)
+ * for (Task extends OperatorDesc> t: rootTasks)
* t.localizeMRTmpFiles(ctx);
*/
}